ots 0.4.3 → 0.4.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. data/README.md +80 -0
  2. data/dictionaries/bg.xml +101 -0
  3. data/dictionaries/ca.xml +141 -0
  4. data/dictionaries/cs.xml +161 -0
  5. data/dictionaries/cy.xml +118 -0
  6. data/dictionaries/da.xml +129 -0
  7. data/dictionaries/de.xml +354 -0
  8. data/dictionaries/el.xml +80 -0
  9. data/dictionaries/en.xml +606 -0
  10. data/dictionaries/eo.xml +171 -0
  11. data/dictionaries/es.xml +369 -0
  12. data/dictionaries/et.xml +172 -0
  13. data/dictionaries/eu.xml +77 -0
  14. data/dictionaries/fi.xml +105 -0
  15. data/dictionaries/fr.xml +199 -0
  16. data/dictionaries/ga.xml +124 -0
  17. data/dictionaries/gl.xml +290 -0
  18. data/dictionaries/he.xml +334 -0
  19. data/dictionaries/hu.xml +280 -0
  20. data/dictionaries/ia.xml +97 -0
  21. data/dictionaries/id.xml +75 -0
  22. data/dictionaries/is.xml +201 -0
  23. data/dictionaries/it.xml +206 -0
  24. data/dictionaries/lv.xml +77 -0
  25. data/dictionaries/mi.xml +76 -0
  26. data/dictionaries/ms.xml +160 -0
  27. data/dictionaries/mt.xml +73 -0
  28. data/dictionaries/nl.xml +245 -0
  29. data/dictionaries/nn.xml +264 -0
  30. data/dictionaries/pl.xml +92 -0
  31. data/dictionaries/pt.xml +365 -0
  32. data/dictionaries/ro.xml +163 -0
  33. data/dictionaries/ru.xml +150 -0
  34. data/dictionaries/sv.xml +255 -0
  35. data/dictionaries/tl.xml +67 -0
  36. data/dictionaries/tr.xml +65 -0
  37. data/dictionaries/uk.xml +98 -0
  38. data/dictionaries/yi.xml +293 -0
  39. data/ext/article.c +119 -0
  40. data/ext/dictionary.c +335 -0
  41. data/ext/extconf.rb +13 -14
  42. data/ext/grader-tc.c +185 -0
  43. data/ext/grader-tc.h +64 -0
  44. data/ext/grader-tf.c +116 -0
  45. data/ext/grader.c +85 -0
  46. data/ext/highlighter.c +128 -0
  47. data/ext/html.c +131 -0
  48. data/ext/libots.h +158 -0
  49. data/ext/ots.c +130 -151
  50. data/ext/ots.h +15 -0
  51. data/ext/parser.c +173 -0
  52. data/ext/relations.c +163 -0
  53. data/ext/stemmer.c +332 -0
  54. data/ext/text.c +98 -0
  55. data/ext/version.h +2 -0
  56. data/ext/wordlist.c +220 -0
  57. data/test/helper.rb +3 -0
  58. data/test/test_article.rb +52 -0
  59. data/test/test_ots.rb +23 -0
  60. metadata +122 -38
  61. data/README +0 -25
  62. data/VERSION +0 -1
  63. data/lib/ots.rb +0 -1
  64. data/test/ots_test.rb +0 -62
@@ -0,0 +1,128 @@
1
+ /*
2
+ * highlighter
3
+ *
4
+ * Copyright (C) 2003 Nadav Rotem <nadav256@hotmail.com>
5
+ *
6
+ * This program is free software; you can redistribute it and/or modify
7
+ * it under the terms of the GNU General Public License as published by
8
+ * the Free Software Foundation; either version 2 of the License, or
9
+ * (at your option) any later version.
10
+ *
11
+ * This program is distributed in the hope that it will be useful,
12
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ * GNU Library General Public License for more details.
15
+ *
16
+ * You should have received a copy of the GNU General Public License
17
+ * along with this program; if not, write to the Free Software
18
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19
+ */
20
+
21
+ #include <stdio.h>
22
+ #include <stdlib.h>
23
+ #include <string.h>
24
+ #include "libots.h"
25
+
26
+ /*After the grader has graded the article and each
27
+ sentence has a score the highlighter will select
28
+ some of the sentences*/
29
+
30
+ static int
31
+ ots_highlight_max_line (OtsArticle * Doc)
32
+ {
33
+ GList *li;
34
+ int max = 0;
35
+ for (li = (GList *) Doc->lines; li != NULL; li = li->next)
36
+ {
37
+ if (0 == (((OtsSentence *) li->data)->selected)) /* if not selected , count me in */
38
+ max = MAX (((OtsSentence *) li->data)->score, max);
39
+
40
+ }
41
+
42
+ for (li = (GList *) Doc->lines; li != NULL; li = li->next)
43
+ {
44
+
45
+ if ((((OtsSentence *) li->data)->score == max) && (((OtsSentence *) li->data)->selected == 0)) /* if score==max && not selected before ,select me; */
46
+ {
47
+ ((OtsSentence *) li->data)->selected = 1;
48
+ return ((OtsSentence *) li->data)->wc;
49
+ }
50
+ }
51
+
52
+ return 0;
53
+ }
54
+
55
+
56
+ /* todo: implement this
57
+
58
+ void
59
+ ots_highlight_doc_wordcount (OtsArticle * Doc, int wordCount)
60
+
61
+ void
62
+ ots_highlight_doc_linecount (OtsArticle * Doc, int wordCount)
63
+
64
+
65
+
66
+ void
67
+ ots_highlight_doc_soft (OtsArticle * Doc, int percent) //blur selection by average of nearby sentences; will mark blocks
68
+ */
69
+
70
+ void
71
+ ots_highlight_doc (OtsArticle * Doc, int percent)
72
+ {
73
+ int i;
74
+ double ratio;
75
+ int wordCount;
76
+
77
+ if (0 == Doc->lineCount)
78
+ return;
79
+
80
+ if (percent > 100)
81
+ percent = 100;
82
+ else if (percent < 0)
83
+ percent = 0;
84
+
85
+ ratio = ((double) (percent)) / (100.0);
86
+
87
+ wordCount = ots_get_article_word_count (Doc);
88
+
89
+ for (i = 0; i < (ratio * (double) wordCount);)
90
+ {
91
+ i += ots_highlight_max_line (Doc);
92
+ }
93
+ }
94
+
95
+ void
96
+ ots_highlight_doc_lines (OtsArticle * Doc, int lines)
97
+ {
98
+ int i;
99
+ int lineCount;
100
+ int tmp;
101
+
102
+ if (0 == Doc->lineCount) return;
103
+
104
+ lineCount = Doc->lineCount;
105
+ i=0;
106
+ while ((i<lines)&&(i<lineCount))
107
+ {
108
+ i++;
109
+ tmp=ots_highlight_max_line (Doc);
110
+ }
111
+
112
+ }
113
+
114
+ void ots_highlight_doc_words (OtsArticle * Doc, int words)
115
+ {
116
+ int i;
117
+ int docWordCount;
118
+
119
+ if (0 == Doc->lineCount) return;
120
+
121
+ docWordCount = ots_get_article_word_count (Doc);
122
+
123
+ i=0;
124
+ while ((i < docWordCount) && (i <= words))
125
+ {
126
+ i += ots_highlight_max_line (Doc);
127
+ }
128
+ }
@@ -0,0 +1,131 @@
1
+ /*
2
+ * html.c
3
+ *
4
+ * Copyright (C) 2003 Nadav Rotem <nadav256@hotmail.com>
5
+ *
6
+ * This program is free software; you can redistribute it and/or modify
7
+ * it under the terms of the GNU General Public License as published by
8
+ * the Free Software Foundation; either version 2 of the License, or
9
+ * (at your option) any later version.
10
+ *
11
+ * This program is distributed in the hope that it will be useful,
12
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ * GNU Library General Public License for more details.
15
+ *
16
+ * You should have received a copy of the GNU General Public License
17
+ * along with this program; if not, write to the Free Software
18
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19
+ */
20
+
21
+ #include <stdio.h>
22
+ #include <stdlib.h>
23
+ #include <string.h>
24
+ #include "libots.h"
25
+
26
+ static unsigned char *
27
+ ots_get_line_HTML (const OtsSentence * aLine, size_t * out_size)
28
+ {
29
+ GList *li;
30
+ GString *text;
31
+ unsigned char *utf8_data;
32
+ char *score_str;
33
+ text = g_string_new (NULL);
34
+
35
+ score_str=g_new0(char,32);
36
+ sprintf(score_str,"<!--(%ld)-->",aLine->score);
37
+ g_string_append (text,score_str);
38
+ g_free(score_str);
39
+
40
+ if ((aLine->selected))
41
+ {
42
+ g_string_append (text,
43
+ "<FONT COLOR=\"#16569E\"><span style=\'background:yellow;\'>");
44
+ }
45
+ else
46
+ {
47
+ g_string_append (text, "<FONT COLOR=\"#16569E\"><span>");
48
+ }
49
+
50
+ for (li = (GList *) aLine->words; li != NULL; li = li->next)
51
+ {
52
+ if (0 == strcmp ((char *) li->data, "\n"))
53
+ g_string_append (text, "<br>");
54
+ else
55
+ g_string_append (text, (char *) li->data);
56
+ }
57
+ g_string_append (text,"</span></FONT>\n");
58
+
59
+ if (out_size)
60
+ *out_size = text->len;
61
+
62
+ utf8_data = text->str;
63
+ g_string_free (text, FALSE);
64
+
65
+ return utf8_data;
66
+ }
67
+
68
+
69
#if 0
/*
 * Disabled helper: writes the HTML fragment of a single sentence to `stream`
 * and releases the temporary buffer.
 */
static void
ots_print_line_HTML (FILE * stream, const OtsSentence * aLine)
{
  size_t html_len;
  unsigned char *html = ots_get_line_HTML (aLine, &html_len);

  fwrite (html, 1, html_len, stream);
  g_free (html);
}
#endif
81
+
82
+
83
+ unsigned char *
84
+ ots_get_doc_HTML (const OtsArticle * Doc, size_t * out_len)
85
+ {
86
+ GList *li;
87
+ GString *text;
88
+ unsigned char *utf8_data;
89
+ size_t line_len;
90
+
91
+ text = g_string_new (NULL);
92
+
93
+
94
+ g_string_append (text,
95
+ "<html>\n<head>\n<title>OTS</title>\n<meta charset=\"utf-8\">\n</head>\n<body>\n");
96
+ g_string_append (text, "<!-- Generated by OpenTextSummarizer -->\n");
97
+ g_string_append (text, "<!--");
98
+ g_string_append (text, Doc->title);
99
+ g_string_append (text, "-->\n");
100
+
101
+
102
+ for (li = (GList *) Doc->lines; li != NULL; li = li->next)
103
+ {
104
+ utf8_data = ots_get_line_HTML ((OtsSentence *) li->data, &line_len);
105
+ g_string_append_len (text, utf8_data, line_len);
106
+ g_free (utf8_data);
107
+ }
108
+ g_string_append (text, "</body></html>\n");
109
+
110
+ if (out_len)
111
+ *out_len = text->len;
112
+ utf8_data = text->str;
113
+
114
+ g_string_free (text, FALSE);
115
+ return utf8_data;
116
+
117
+ }
118
+
119
+
120
+
121
+ void
122
+ ots_print_HTML (FILE * stream, const OtsArticle * Doc)
123
+ {
124
+ unsigned char *utf8_txt;
125
+ size_t len;
126
+
127
+ utf8_txt = ots_get_doc_HTML (Doc, &len);
128
+ fwrite (utf8_txt, 1, len, stream);
129
+ g_free (utf8_txt);
130
+
131
+ }
@@ -0,0 +1,158 @@
1
+ /*
2
+ * libots.h
3
+ *
4
+ * Copyright (C) 2003 Nadav Rotem <nadav256@hotmail.com>
5
+ *
6
+ * This program is free software; you can redistribute it and/or modify
7
+ * it under the terms of the GNU General Public License as published by
8
+ * the Free Software Foundation; either version 2 of the License, or
9
+ * (at your option) any later version.
10
+ *
11
+ * This program is distributed in the hope that it will be useful,
12
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ * GNU Library General Public License for more details.
15
+ *
16
+ * You should have received a copy of the GNU General Public License
17
+ * along with this program; if not, write to the Free Software
18
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19
+ */
20
+
21
+ #ifndef HAVE_LIBOTS_H
22
+ #define HAVE_LIBOTS_H
23
+
24
+ #include <glib.h>
25
+
26
+ G_BEGIN_DECLS
27
+
28
+ typedef struct
29
+ { /* Term Frequency record: a word paired with a numeric weight */
30
+ char* word; /* the term */
31
+ double tf; /* term-frequency weight. NOTE(review): ots_new_OtsWordTF() names this argument idf, so the field presumably also carries IDF values - confirm */
32
+ } OtsWordTF;
33
+
34
+
35
+ typedef struct /* rule lists; OtsArticle.stem points at one of these for stemming & parsing */
36
+ {
37
+ /* each member is a GList of char* */
38
+ GList *RemovePre; /* (a|b) replace string a with b */
39
+ GList *RemovePost;
40
+ GList *step1_pre;
41
+ GList *step1_post;
42
+
43
+ GList *synonyms;
44
+ GList *manual;
45
+
46
+ GList *ParserBreak;
47
+ GList *ParserDontBreak;
48
+
49
+
50
+ /* not implemented yet */
51
+ GList *ReplaceChars;
52
+
53
+ } OtsStemRule;
54
+
55
+
56
+ typedef struct /* one sentence (line) of an article */
57
+ {
58
+ GList *words; /* a GList of the sentence's words (char*) */
59
+ glong score; /* score set by the grader */
60
+ gboolean selected; /* non-zero once the highlighter has selected this sentence */
61
+ gint wc; /* word count of the sentence */
62
+ void *user_data; /* caller-defined pointer, e.g. to the original sentence, or maybe a serial number */
63
+ } OtsSentence;
64
+
65
+
66
+ typedef struct /* a parsed article together with the data used to grade it */
67
+ {
68
+ GList *lines; /* a GList of sentences (OtsSentence*) */
69
+ gint lineCount; /* number of lines in the text */
70
+ char *title; /* auto-generated title; callers in this package check it for NULL */
71
+
72
+ OtsStemRule *stem; /* stemming & parsing rules */
73
+
74
+ /* Term Frequency grader */
75
+ GList *tf_terms;
76
+ GList *idf_terms;
77
+
78
+
79
+ /* Term Count grader */
80
+ GList *dict; /* dictionary loaded from xml */
81
+ GList *wordStat; /* a word list of all words in the article and their occurrence counts */
82
+ GList *ImpWords; /* important words - for the term count grader */
83
+
84
+
85
+ } OtsArticle;
86
+
87
+
88
+ OtsArticle *ots_new_article (void);
89
+ void ots_free_article (OtsArticle *art);
90
+
91
+ /*parser*/
92
+ void ots_parse_file (FILE * stream, OtsArticle * Doc); /*file input */
93
+ void ots_parse_stream(const unsigned char *utf8 , size_t len ,OtsArticle *Doc); /*parse unicode stream*/
94
+
95
+ OtsSentence *ots_append_line (OtsArticle * Doc);
96
+ void ots_append_word (OtsSentence * aLine,unsigned const char *aWord);
97
+ void ots_add_wordstat (OtsArticle * Doc,unsigned const char *wordString);
98
+
99
+
100
+ /*dictionary*/
101
+ gboolean ots_load_xml_dictionary (OtsArticle * Doc, const char *name);
102
+
103
+ int ots_get_article_word_count (const OtsArticle * Doc);
104
+
105
+
106
+ /*grader*/
107
+ void ots_highlight_doc (OtsArticle * Doc, int percent); /*example: 20%*/
108
+ void ots_highlight_doc_lines (OtsArticle * Doc, int lines); /*example: 10 lines*/
109
+ void ots_highlight_doc_words (OtsArticle * Doc, int words); /*example: 50 words*/
110
+
111
+ void ots_grade_doc (OtsArticle * Doc);
112
+
113
+ void ots_free_OtsWordTF(OtsWordTF *obj); /*todo: put in .h file*/
114
+ OtsWordTF* ots_new_OtsWordTF(const char* word,const double idf);
115
+
116
+
117
+ /*HTML output*/
118
+ void ots_print_HTML (FILE * stream, const OtsArticle * Doc);
119
+ unsigned char *ots_get_doc_HTML (const OtsArticle * Doc, size_t * out_len);
120
+
121
+ /*TEXT output*/
122
+ void ots_print_doc (FILE * stream, const OtsArticle * Doc);
123
+ unsigned char *ots_get_doc_text (const OtsArticle * Doc, size_t * out_len);
124
+
125
+
126
+ /*Plugin writing*/
127
+ unsigned char* ots_get_line_text (const OtsSentence *aLine, gboolean only_if_selected, size_t *out_size);
128
+ gboolean ots_is_line_selected(const OtsSentence *aLine);
129
+
130
+ /*Stemming support*/
131
+ OtsStemRule *new_stem_rule(void);
132
+ void free_stem_rule (OtsStemRule *rule);
133
+ unsigned char * ots_stem_strip (unsigned const char * aWord, const OtsStemRule *rule); /*returns newly allocated string with the root of the word*/
134
+ unsigned char *ots_stem_format (unsigned const char *aWord, const OtsStemRule * rule); /*Remove leading spaces, comas, colons, etc. */
135
+
136
+ /*Relations between texts*/
137
+
138
+ /*Returns the number of topics that two blocks of text share*/
139
+ int ots_text_relations(
140
+ const unsigned char *text1,const unsigned char *lang_code1,
141
+ const unsigned char *text2,const unsigned char *lang_code2,const int topic_num);
142
+
143
+ /*For a given text, return the list of the topics*/
144
+ char* ots_text_topics(const unsigned char *text,const unsigned char *lang_code,int topic_num);
145
+
146
+
147
+ /*For a given text, return the list of the stemmed topics*/
148
+ GList* ots_text_stem_list(const unsigned char *text,const unsigned char *lang_code,int topic_num);
149
+
150
+
151
+ /*Gives a score on the relations between two lists of topics; similar to the inner product*/
152
+ int ots_topic_list_score(const GList *topic_list1,const GList *topic_list2);
153
+
154
+ G_END_DECLS
155
+
156
+
157
+
158
+ #endif /* HAVE_LIBOTS_H */
data/ext/ots.c CHANGED
@@ -1,197 +1,176 @@
1
- #include <ruby.h>
1
+ #include "ots.h"
2
+ #include <sys/types.h>
3
+ #include <dirent.h>
4
+ #include <errno.h>
2
5
 
3
- /* ruby 1.9 only */
4
- #ifdef RUBY_VM
5
- #include <ruby/encoding.h>
6
- #endif
6
+ static VALUE mOTS, cArticle;
7
7
 
8
- #include <stdio.h>
9
- #include <stdlib.h>
10
- #include <string.h>
8
+ static void article_free(OtsArticle *article) {
9
+ if (article)
10
+ ots_free_article(article);
11
+ }
11
12
 
12
- #include <libots-1/ots/libots.h>
13
+ VALUE article_allocate(VALUE klass) {
14
+ OtsArticle *article = ots_new_article();
15
+ return Data_Wrap_Struct(klass, 0, article_free, article);
16
+ }
13
17
 
14
- #define ID_CONST_GET rb_intern("const_get")
15
- #define CONST_GET(scope, constant) (rb_funcall(scope, ID_CONST_GET, 1, rb_str_new2(constant)))
18
+ OtsArticle* article_handle(VALUE self) {
19
+ OtsArticle *article = 0;
20
+ Data_Get_Struct(self, OtsArticle, article);
21
+ if (!article)
22
+ rb_raise(rb_eArgError, "invalid OTS::Article instance");
23
+ return article;
24
+ }
16
25
 
17
- static VALUE rb_cOTS;
18
- static VALUE eLoadError;
19
- static VALUE eRuntimeError;
20
- static VALUE eArgumentError;
26
+ void article_load_dictionary(OtsArticle *article, char *name) {
27
+ if (!ots_load_xml_dictionary(article, name)) {
28
+ rb_raise(rb_eLoadError, "Could not find dictionary file: %s", name);
29
+ }
30
+ }
21
31
 
22
- typedef struct {
23
- gchar *word; /* the word */
24
- gchar *stem; /*stem of the word*/
25
- gint occ; /* how many times have we seen this word in the text? */
26
- } OtsWordEntery;
32
+ VALUE article_initialize(int argc, VALUE *argv, VALUE self) {
33
+ VALUE text, dictionary;
34
+ OtsArticle *article = article_handle(self);
27
35
 
36
+ rb_scan_args(argc, argv, "11", &text, &dictionary);
28
37
 
29
- /* helpers */
38
+ if (TYPE(text) != T_STRING)
39
+ rb_raise(rb_eArgError, "invalid +text+");
30
40
 
31
- OtsArticle* get_article(VALUE self, gboolean error_on_missing) {
32
- VALUE rb_article_object = rb_iv_get(self, "@article");
33
- if (rb_article_object == Qnil) {
34
- if (error_on_missing)
35
- rb_raise(eRuntimeError, "libots document not initialized properly. Did you forget to parse content ?");
41
+ if (NIL_P(dictionary))
42
+ article_load_dictionary(article, "en");
36
43
  else
37
- return NULL;
38
- }
39
- return (OtsArticle *)DATA_PTR(rb_article_object);
40
- }
44
+ article_load_dictionary(article, CSTRING(dictionary));
45
+
46
+ ots_parse_stream(RSTRING_PTR(text), RSTRING_LEN(text), article);
47
+ ots_grade_doc(article);
41
48
 
42
- void rb_ots_free_article(VALUE self) {
43
- OtsArticle *article = DATA_PTR(rb_iv_get(self, "@article"));
44
- ots_free_article(article);
49
+ rb_iv_set(self, "@encoding", (VALUE)rb_enc_get(text));
50
+
51
+ return self;
45
52
  }
46
53
 
47
- VALUE rb_string(char *utf8) {
48
- VALUE str = rb_str_new(utf8, strlen(utf8));
49
54
 
50
- /* ruby 1.9 only - force bytestream to utf8 */
51
- #ifdef RUBY_VM
52
- rb_enc_associate(str, rb_to_encoding(rb_str_new2("UTF-8")));
53
- ENC_CODERANGE_CLEAR(str);
54
- #endif
55
+ VALUE article_summary(OtsArticle *article, rb_encoding *encoding) {
56
+ OtsSentence *sentence;
55
57
 
56
- return str;
57
- }
58
+ GList *line_ptr = article->lines;
59
+ VALUE summary = rb_ary_new();
58
60
 
59
- /* ruby libots methods/wrappers */
61
+ while (line_ptr != NULL) {
62
+ sentence = (OtsSentence *)line_ptr->data;
60
63
 
61
- VALUE rb_ots_init(VALUE self) {
62
- OtsArticle *article = get_article(self, FALSE);
63
- VALUE dict = Qnil;
64
- if (article != NULL) {
65
- dict = rb_iv_get(self, "@dict");
66
- ots_free_article(article);
67
- }
68
- article = ots_new_article();
69
- rb_iv_set(self, "@article", Data_Wrap_Struct(rb_cObject, 0, 0, article));
70
- rb_iv_set(self, "@dict", dict);
71
- return self;
72
- }
64
+ if (sentence->selected) {
65
+ size_t size;
66
+ unsigned char* content = ots_get_line_text(sentence, TRUE, &size);
73
67
 
74
- VALUE rb_ots_load_dictionary(VALUE self, VALUE dict) {
75
- char *dict_cstr = "en";
76
- if (dict != Qnil) dict_cstr = RSTRING_PTR(dict);
68
+ VALUE line = rb_hash_new();
69
+ rb_hash_aset(line, ID2SYM(rb_intern("sentence")), rb_enc_str_new((char *)content, size, encoding));
70
+ rb_hash_aset(line, ID2SYM(rb_intern("score")), LONG2FIX(sentence->score));
71
+ rb_ary_push(summary, line);
77
72
 
78
- OtsArticle *article = get_article(self, FALSE);
79
- if (article == NULL) {
80
- rb_ots_init(self);
81
- article = get_article(self, TRUE);
82
- }
73
+ // reset this so subsequent calls work right.
74
+ sentence->selected = FALSE;
75
+ }
83
76
 
84
- if (!ots_load_xml_dictionary(article, (unsigned const char *)dict_cstr)) {
85
- rb_ots_free_article(self);
86
- rb_raise(eLoadError, "Could not find dictionary file: %s", dict_cstr);
77
+ line_ptr = g_list_next(line_ptr);
87
78
  }
88
79
 
89
- rb_iv_set(self, "@dict", dict);
90
- return Qtrue;
80
+ return summary;
91
81
  }
92
82
 
93
- VALUE rb_ots_parse_string(VALUE self, VALUE string) {
94
- const unsigned char *string_cstr = (const unsigned char *)RSTRING_PTR(string);
95
- size_t string_len = RSTRING_LEN(string);
83
+ VALUE article_summarize(VALUE self, VALUE options) {
84
+ VALUE lines, percent;
85
+ OtsArticle *article = article_handle(self);
96
86
 
97
- rb_ots_init(self);
98
- rb_ots_load_dictionary(self, rb_iv_get(self, "@dict"));
99
- OtsArticle *article = get_article(self, TRUE);
100
- ots_parse_stream(string_cstr, string_len, article);
101
- ots_grade_doc(article);
102
- return Qtrue;
103
- }
87
+ if (TYPE(options) != T_HASH)
88
+ rb_raise(rb_eArgError, "expect an options hash");
104
89
 
105
- VALUE rb_ots_highlight_lines(VALUE self, int lines) {
106
- OtsArticle *article = get_article(self, TRUE);
107
- ots_highlight_doc_lines(article, lines);
108
- return Qtrue;
109
- }
90
+ lines = rb_hash_aref(options, ID2SYM(rb_intern("lines")));
91
+ percent = rb_hash_aref(options, ID2SYM(rb_intern("percent")));
92
+
93
+ if (NIL_P(lines) && NIL_P(percent))
94
+ rb_raise(rb_eArgError, "expect +lines+ or +percent+ to be provided");
110
95
 
111
- VALUE rb_ots_highlight_percent(VALUE self, int percent) {
112
- OtsArticle *article = get_article(self, TRUE);
113
- ots_highlight_doc(article, percent);
114
- return Qtrue;
96
+ if (lines != Qnil)
97
+ ots_highlight_doc_lines(article, NUM2INT(lines));
98
+ else
99
+ ots_highlight_doc(article, NUM2INT(percent));
100
+
101
+ return article_summary(article, (rb_encoding *)rb_iv_get(self, "@encoding"));
115
102
  }
116
103
 
117
- VALUE rb_ots_article_title(VALUE self) {
118
- OtsArticle *article = get_article(self, TRUE);
119
- if (article->title != NULL)
120
- return rb_string(article->title);
121
- else
122
- return Qnil;
104
+ VALUE article_title(VALUE self) {
105
+ OtsArticle *article = article_handle(self);
106
+ return (article->title ? rb_enc_str_new2(article->title, (rb_encoding*)rb_iv_get(self, "@encoding")) : Qnil);
123
107
  }
124
108
 
125
- VALUE rb_ots_article_keywords(VALUE self) {
126
- OtsArticle *article = get_article(self, TRUE);
127
- GList* words = article->ImpWords;
128
- VALUE iwords = rb_ary_new();
129
- while (words != NULL) {
130
- OtsWordEntery *data = (OtsWordEntery *)words->data;
131
- if (data != NULL && strlen(data->word) > 0)
132
- rb_ary_push(iwords, rb_string(data->word));
133
- words = words->next;
134
- }
109
+ typedef struct {
110
+ gchar *word; /* the word */
111
+ gchar *stem; /*stem of the word*/
112
+ gint occ; /* how many times have we seen this word in the text? */
113
+ } OtsWordEntry;
135
114
 
136
- return iwords;
137
- }
138
115
 
139
- VALUE rb_ots_get_highlighted_lines(VALUE self) {
140
- OtsArticle *article = get_article(self, TRUE);
141
- OtsSentence *sentence;
142
- GList *curr_line = article->lines;
143
- VALUE hlt_lines = rb_ary_new();
116
+ VALUE article_keywords(VALUE self) {
117
+ OtsArticle *article = article_handle(self);
118
+ rb_encoding *encoding = (rb_encoding*)rb_iv_get(self, "@encoding");
144
119
 
145
- while (curr_line != NULL) {
146
- sentence = (OtsSentence *)curr_line->data;
147
- if (sentence->selected) {
148
- size_t len;
149
- unsigned char* content = ots_get_line_text(sentence, TRUE, &len);
150
- VALUE hlt_line = rb_hash_new();
151
- rb_hash_aset(hlt_line, ID2SYM(rb_intern("sentence")), rb_string((char *)content));
152
- rb_hash_aset(hlt_line, ID2SYM(rb_intern("score")), LONG2FIX(sentence->score));
153
- rb_ary_push(hlt_lines, hlt_line);
120
+ VALUE words = rb_ary_new();
121
+ GList* word_ptr = article->ImpWords;
122
+
123
+ while (word_ptr) {
124
+ OtsWordEntry *data = (OtsWordEntry *)word_ptr->data;
125
+ if (data && strlen(data->word) > 0)
126
+ rb_ary_push(words, rb_enc_str_new2(data->word, encoding));
127
+ word_ptr = word_ptr->next;
154
128
  }
155
- curr_line = g_list_next(curr_line);
156
- }
157
129
 
158
- return hlt_lines;
130
+ return words;
159
131
  }
160
132
 
161
- VALUE rb_summarize(VALUE self, VALUE options) {
162
-
163
- VALUE lines = rb_hash_aref(options, ID2SYM(rb_intern("lines")));
164
- VALUE percent = rb_hash_aref(options, ID2SYM(rb_intern("percent")));
133
+ VALUE ots_parse(int argc, VALUE *argv, VALUE self) {
134
+ VALUE article = article_allocate(cArticle);
135
+ article_initialize(argc, argv, article);
136
+ return article;
137
+ }
165
138
 
166
- if (lines != Qnil && percent != Qnil) {
167
- rb_ots_free_article(self);
168
- rb_raise(eArgumentError, "Cannot summarize on :lines & :percent, only one is allowed");
169
- }
170
- else if (lines == Qnil && percent == Qnil) {
171
- rb_ots_free_article(self);
172
- rb_raise(eArgumentError, "Need either :lines or :percent to summarize");
173
- }
139
+ VALUE ots_dictionaries(VALUE self) {
140
+ DIR *dir;
141
+ struct dirent *entry;
142
+ VALUE dictionaries = rb_ary_new();
143
+
144
+ if ((dir = opendir(DICTIONARY_DIR))) {
145
+ while ((entry = readdir(dir))) {
146
+ // entry->d_type is not portable.
147
+ if (strstr(entry->d_name, ".xml"))
148
+ rb_ary_push(dictionaries, rb_str_new(entry->d_name, strlen(entry->d_name) - 4));
149
+ }
150
+ }
151
+ else {
152
+ rb_raise(rb_eIOError, "unable to open dictionary directory: %s", strerror(errno));
153
+ }
174
154
 
175
- if (lines != Qnil)
176
- rb_ots_highlight_lines(self, FIX2INT(lines));
177
- else if (percent != Qnil)
178
- rb_ots_highlight_percent(self, FIX2INT(percent));
179
- return rb_ots_get_highlighted_lines(self);
155
+ closedir(dir);
156
+ return dictionaries;
180
157
  }
181
158
 
182
159
  /* init */
183
160
 
184
161
  void Init_ots(void) {
185
- eLoadError = CONST_GET(rb_mKernel, "LoadError");
186
- eRuntimeError = CONST_GET(rb_mKernel, "RuntimeError");
187
- eArgumentError = CONST_GET(rb_mKernel, "ArgumentError");
188
- rb_cOTS = rb_define_class("OTS", rb_cObject);
189
- rb_define_method(rb_cOTS, "load_dictionary", rb_ots_load_dictionary, 1);
190
- rb_define_method(rb_cOTS, "parse", rb_ots_parse_string, 1);
191
- rb_define_method(rb_cOTS, "highlight_lines", rb_ots_highlight_lines, 1);
192
- rb_define_method(rb_cOTS, "highlight_percent", rb_ots_highlight_percent, 1);
193
- rb_define_method(rb_cOTS, "highlighted_content", rb_ots_get_highlighted_lines, 0);
194
- rb_define_method(rb_cOTS, "summarize", rb_summarize, 1);
195
- rb_define_method(rb_cOTS, "title", rb_ots_article_title, 0);
196
- rb_define_method(rb_cOTS, "keywords", rb_ots_article_keywords, 0);
162
+ mOTS = rb_define_module("OTS"); /* extension entry point: OTS is now a module, not a class */
163
+ cArticle = rb_define_class_under(mOTS, "Article", rb_cObject); /* OTS::Article wraps one OtsArticle */
164
+
165
+ rb_define_method(cArticle, "initialize", RUBY_METHOD_FUNC(article_initialize), -1); /* -1: arguments arrive as argc/argv */
166
+ rb_define_method(cArticle, "summarize", RUBY_METHOD_FUNC(article_summarize), 1);
167
+ rb_define_method(cArticle, "title", RUBY_METHOD_FUNC(article_title), 0);
168
+ rb_define_method(cArticle, "keywords", RUBY_METHOD_FUNC(article_keywords), 0);
169
+
170
+ rb_define_module_function(mOTS, "parse", RUBY_METHOD_FUNC(ots_parse), -1); /* OTS.parse(text, dictionary = nil) */
171
+ rb_define_module_function(mOTS, "dictionaries", RUBY_METHOD_FUNC(ots_dictionaries), 0);
172
+
173
+ rb_define_alloc_func(cArticle, article_allocate); /* every instance gets its OtsArticle at allocation time */
174
+
175
+ rb_define_const(mOTS, "VERSION", rb_str_new2(RUBY_OTS_VERSION)); /* exposed as OTS::VERSION */
197
176
  }