ots 0.4.3 → 0.4.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (64) hide show
  1. data/README.md +80 -0
  2. data/dictionaries/bg.xml +101 -0
  3. data/dictionaries/ca.xml +141 -0
  4. data/dictionaries/cs.xml +161 -0
  5. data/dictionaries/cy.xml +118 -0
  6. data/dictionaries/da.xml +129 -0
  7. data/dictionaries/de.xml +354 -0
  8. data/dictionaries/el.xml +80 -0
  9. data/dictionaries/en.xml +606 -0
  10. data/dictionaries/eo.xml +171 -0
  11. data/dictionaries/es.xml +369 -0
  12. data/dictionaries/et.xml +172 -0
  13. data/dictionaries/eu.xml +77 -0
  14. data/dictionaries/fi.xml +105 -0
  15. data/dictionaries/fr.xml +199 -0
  16. data/dictionaries/ga.xml +124 -0
  17. data/dictionaries/gl.xml +290 -0
  18. data/dictionaries/he.xml +334 -0
  19. data/dictionaries/hu.xml +280 -0
  20. data/dictionaries/ia.xml +97 -0
  21. data/dictionaries/id.xml +75 -0
  22. data/dictionaries/is.xml +201 -0
  23. data/dictionaries/it.xml +206 -0
  24. data/dictionaries/lv.xml +77 -0
  25. data/dictionaries/mi.xml +76 -0
  26. data/dictionaries/ms.xml +160 -0
  27. data/dictionaries/mt.xml +73 -0
  28. data/dictionaries/nl.xml +245 -0
  29. data/dictionaries/nn.xml +264 -0
  30. data/dictionaries/pl.xml +92 -0
  31. data/dictionaries/pt.xml +365 -0
  32. data/dictionaries/ro.xml +163 -0
  33. data/dictionaries/ru.xml +150 -0
  34. data/dictionaries/sv.xml +255 -0
  35. data/dictionaries/tl.xml +67 -0
  36. data/dictionaries/tr.xml +65 -0
  37. data/dictionaries/uk.xml +98 -0
  38. data/dictionaries/yi.xml +293 -0
  39. data/ext/article.c +119 -0
  40. data/ext/dictionary.c +335 -0
  41. data/ext/extconf.rb +13 -14
  42. data/ext/grader-tc.c +185 -0
  43. data/ext/grader-tc.h +64 -0
  44. data/ext/grader-tf.c +116 -0
  45. data/ext/grader.c +85 -0
  46. data/ext/highlighter.c +128 -0
  47. data/ext/html.c +131 -0
  48. data/ext/libots.h +158 -0
  49. data/ext/ots.c +130 -151
  50. data/ext/ots.h +15 -0
  51. data/ext/parser.c +173 -0
  52. data/ext/relations.c +163 -0
  53. data/ext/stemmer.c +332 -0
  54. data/ext/text.c +98 -0
  55. data/ext/version.h +2 -0
  56. data/ext/wordlist.c +220 -0
  57. data/test/helper.rb +3 -0
  58. data/test/test_article.rb +52 -0
  59. data/test/test_ots.rb +23 -0
  60. metadata +122 -38
  61. data/README +0 -25
  62. data/VERSION +0 -1
  63. data/lib/ots.rb +0 -1
  64. data/test/ots_test.rb +0 -62
@@ -0,0 +1,128 @@
1
+ /*
2
+ * highlighter
3
+ *
4
+ * Copyright (C) 2003 Nadav Rotem <nadav256@hotmail.com>
5
+ *
6
+ * This program is free software; you can redistribute it and/or modify
7
+ * it under the terms of the GNU General Public License as published by
8
+ * the Free Software Foundation; either version 2 of the License, or
9
+ * (at your option) any later version.
10
+ *
11
+ * This program is distributed in the hope that it will be useful,
12
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ * GNU Library General Public License for more details.
15
+ *
16
+ * You should have received a copy of the GNU General Public License
17
+ * along with this program; if not, write to the Free Software
18
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19
+ */
20
+
21
+ #include <stdio.h>
22
+ #include <stdlib.h>
23
+ #include <string.h>
24
+ #include "libots.h"
25
+
26
+ /*After the grader has graded the article and each
27
+ sentence has a score the highlighter will select
28
+ some of the sentences*/
29
+
30
+ static int
31
+ ots_highlight_max_line (OtsArticle * Doc)
32
+ {
33
+ GList *li;
34
+ int max = 0;
35
+ for (li = (GList *) Doc->lines; li != NULL; li = li->next)
36
+ {
37
+ if (0 == (((OtsSentence *) li->data)->selected)) /* if not selected , count me in */
38
+ max = MAX (((OtsSentence *) li->data)->score, max);
39
+
40
+ }
41
+
42
+ for (li = (GList *) Doc->lines; li != NULL; li = li->next)
43
+ {
44
+
45
+ if ((((OtsSentence *) li->data)->score == max) && (((OtsSentence *) li->data)->selected == 0)) /* if score==max && not selected before ,select me; */
46
+ {
47
+ ((OtsSentence *) li->data)->selected = 1;
48
+ return ((OtsSentence *) li->data)->wc;
49
+ }
50
+ }
51
+
52
+ return 0;
53
+ }
54
+
55
+
56
+ /* todo: implement this
57
+
58
+ void
59
+ ots_highlight_doc_wordcount (OtsArticle * Doc, int wordCount)
60
+
61
+ void
62
+ ots_highlight_doc_linecount (OtsArticle * Doc, int wordCount)
63
+
64
+
65
+
66
+ void
67
+ ots_highlight_doc_soft (OtsArticle * Doc, int percent) //blur selection by the average of nearby sentences; will mark blocks
68
+ */
69
+
70
+ void
71
+ ots_highlight_doc (OtsArticle * Doc, int percent)
72
+ {
73
+ int i;
74
+ double ratio;
75
+ int wordCount;
76
+
77
+ if (0 == Doc->lineCount)
78
+ return;
79
+
80
+ if (percent > 100)
81
+ percent = 100;
82
+ else if (percent < 0)
83
+ percent = 0;
84
+
85
+ ratio = ((double) (percent)) / (100.0);
86
+
87
+ wordCount = ots_get_article_word_count (Doc);
88
+
89
+ for (i = 0; i < (ratio * (double) wordCount);)
90
+ {
91
+ i += ots_highlight_max_line (Doc);
92
+ }
93
+ }
94
+
95
+ void
96
+ ots_highlight_doc_lines (OtsArticle * Doc, int lines)
97
+ {
98
+ int i;
99
+ int lineCount;
100
+ int tmp;
101
+
102
+ if (0 == Doc->lineCount) return;
103
+
104
+ lineCount = Doc->lineCount;
105
+ i=0;
106
+ while ((i<lines)&&(i<lineCount))
107
+ {
108
+ i++;
109
+ tmp=ots_highlight_max_line (Doc);
110
+ }
111
+
112
+ }
113
+
114
+ void ots_highlight_doc_words (OtsArticle * Doc, int words)
115
+ {
116
+ int i;
117
+ int docWordCount;
118
+
119
+ if (0 == Doc->lineCount) return;
120
+
121
+ docWordCount = ots_get_article_word_count (Doc);
122
+
123
+ i=0;
124
+ while ((i < docWordCount) && (i <= words))
125
+ {
126
+ i += ots_highlight_max_line (Doc);
127
+ }
128
+ }
@@ -0,0 +1,131 @@
1
+ /*
2
+ * html.c
3
+ *
4
+ * Copyright (C) 2003 Nadav Rotem <nadav256@hotmail.com>
5
+ *
6
+ * This program is free software; you can redistribute it and/or modify
7
+ * it under the terms of the GNU General Public License as published by
8
+ * the Free Software Foundation; either version 2 of the License, or
9
+ * (at your option) any later version.
10
+ *
11
+ * This program is distributed in the hope that it will be useful,
12
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ * GNU Library General Public License for more details.
15
+ *
16
+ * You should have received a copy of the GNU General Public License
17
+ * along with this program; if not, write to the Free Software
18
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19
+ */
20
+
21
+ #include <stdio.h>
22
+ #include <stdlib.h>
23
+ #include <string.h>
24
+ #include "libots.h"
25
+
26
+ static unsigned char *
27
+ ots_get_line_HTML (const OtsSentence * aLine, size_t * out_size)
28
+ {
29
+ GList *li;
30
+ GString *text;
31
+ unsigned char *utf8_data;
32
+ char *score_str;
33
+ text = g_string_new (NULL);
34
+
35
+ score_str=g_new0(char,32);
36
+ sprintf(score_str,"<!--(%ld)-->",aLine->score);
37
+ g_string_append (text,score_str);
38
+ g_free(score_str);
39
+
40
+ if ((aLine->selected))
41
+ {
42
+ g_string_append (text,
43
+ "<FONT COLOR=\"#16569E\"><span style=\'background:yellow;\'>");
44
+ }
45
+ else
46
+ {
47
+ g_string_append (text, "<FONT COLOR=\"#16569E\"><span>");
48
+ }
49
+
50
+ for (li = (GList *) aLine->words; li != NULL; li = li->next)
51
+ {
52
+ if (0 == strcmp ((char *) li->data, "\n"))
53
+ g_string_append (text, "<br>");
54
+ else
55
+ g_string_append (text, (char *) li->data);
56
+ }
57
+ g_string_append (text,"</span></FONT>\n");
58
+
59
+ if (out_size)
60
+ *out_size = text->len;
61
+
62
+ utf8_data = text->str;
63
+ g_string_free (text, FALSE);
64
+
65
+ return utf8_data;
66
+ }
67
+
68
+
69
#if 0
/*
 * Write the HTML fragment of a single sentence to `stream`.
 * Compiled out; kept for reference only.
 */
static void
ots_print_line_HTML (FILE * stream, const OtsSentence * aLine)
{
  size_t html_len;
  unsigned char *html = ots_get_line_HTML (aLine, &html_len);

  fwrite (html, 1, html_len, stream);
  g_free (html);
}
#endif
81
+
82
+
83
+ unsigned char *
84
+ ots_get_doc_HTML (const OtsArticle * Doc, size_t * out_len)
85
+ {
86
+ GList *li;
87
+ GString *text;
88
+ unsigned char *utf8_data;
89
+ size_t line_len;
90
+
91
+ text = g_string_new (NULL);
92
+
93
+
94
+ g_string_append (text,
95
+ "<html>\n<head>\n<title>OTS</title>\n<meta charset=\"utf-8\">\n</head>\n<body>\n");
96
+ g_string_append (text, "<!-- Generated by OpenTextSummarizer -->\n");
97
+ g_string_append (text, "<!--");
98
+ g_string_append (text, Doc->title);
99
+ g_string_append (text, "-->\n");
100
+
101
+
102
+ for (li = (GList *) Doc->lines; li != NULL; li = li->next)
103
+ {
104
+ utf8_data = ots_get_line_HTML ((OtsSentence *) li->data, &line_len);
105
+ g_string_append_len (text, utf8_data, line_len);
106
+ g_free (utf8_data);
107
+ }
108
+ g_string_append (text, "</body></html>\n");
109
+
110
+ if (out_len)
111
+ *out_len = text->len;
112
+ utf8_data = text->str;
113
+
114
+ g_string_free (text, FALSE);
115
+ return utf8_data;
116
+
117
+ }
118
+
119
+
120
+
121
+ void
122
+ ots_print_HTML (FILE * stream, const OtsArticle * Doc)
123
+ {
124
+ unsigned char *utf8_txt;
125
+ size_t len;
126
+
127
+ utf8_txt = ots_get_doc_HTML (Doc, &len);
128
+ fwrite (utf8_txt, 1, len, stream);
129
+ g_free (utf8_txt);
130
+
131
+ }
@@ -0,0 +1,158 @@
1
+ /*
2
+ * libots.h
3
+ *
4
+ * Copyright (C) 2003 Nadav Rotem <nadav256@hotmail.com>
5
+ *
6
+ * This program is free software; you can redistribute it and/or modify
7
+ * it under the terms of the GNU General Public License as published by
8
+ * the Free Software Foundation; either version 2 of the License, or
9
+ * (at your option) any later version.
10
+ *
11
+ * This program is distributed in the hope that it will be useful,
12
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ * GNU Library General Public License for more details.
15
+ *
16
+ * You should have received a copy of the GNU General Public License
17
+ * along with this program; if not, write to the Free Software
18
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19
+ */
20
+
21
+ #ifndef HAVE_LIBOTS_H
22
+ #define HAVE_LIBOTS_H
23
+
24
+ #include <glib.h>
25
+
26
+ G_BEGIN_DECLS
27
+
28
+ typedef struct
29
+ { /* the Term Frequency data structure */
30
+ char* word;
31
+ double tf; /*Also used for IDF*/
32
+ } OtsWordTF;
33
+
34
+
35
+ typedef struct
36
+ {
37
+ /*a GList of char* */
38
+ GList *RemovePre; /* (a|b) replace string a with b */
39
+ GList *RemovePost;
40
+ GList *step1_pre;
41
+ GList *step1_post;
42
+
43
+ GList *synonyms;
44
+ GList *manual;
45
+
46
+ GList *ParserBreak;
47
+ GList *ParserDontBreak;
48
+
49
+
50
+ /*to be implemented*/
51
+ GList *ReplaceChars;
52
+
53
+ } OtsStemRule;
54
+
55
+
56
+ typedef struct
57
+ {
58
+ GList *words; /* a Glist of words (char*) */
59
+ glong score; /*score set by the grader*/
60
+ gboolean selected; /*is selected?*/
61
+ gint wc; /*word count*/
62
+ void *user_data; /*pointer to the original sentence , or serial number maybe*/
63
+ } OtsSentence;
64
+
65
+
66
+ typedef struct
67
+ {
68
+ GList *lines; /* a Glist of sentences (struct Sentence) */
69
+ gint lineCount; /*lines in the text*/
70
+ char *title; /*title , auto generated*/
71
+
72
+ OtsStemRule *stem; /*stemming & parsing rules*/
73
+
74
+ /*Term Frequency grader*/
75
+ GList *tf_terms;
76
+ GList *idf_terms;
77
+
78
+
79
+ /*Term Count grader*/
80
+ GList *dict; /* dictionary from xml*/
81
+ GList *wordStat; /* a wordlist of all words in the article and their occ */
82
+ GList *ImpWords; /*important words - for term count grader*/
83
+
84
+
85
+ } OtsArticle;
86
+
87
+
88
+ OtsArticle *ots_new_article (void);
89
+ void ots_free_article (OtsArticle *art);
90
+
91
+ /*parser*/
92
+ void ots_parse_file (FILE * stream, OtsArticle * Doc); /*file input */
93
+ void ots_parse_stream(const unsigned char *utf8 , size_t len ,OtsArticle *Doc); /*parse unicode stream*/
94
+
95
+ OtsSentence *ots_append_line (OtsArticle * Doc);
96
+ void ots_append_word (OtsSentence * aLine,unsigned const char *aWord);
97
+ void ots_add_wordstat (OtsArticle * Doc,unsigned const char *wordString);
98
+
99
+
100
+ /*dictionary*/
101
+ gboolean ots_load_xml_dictionary (OtsArticle * Doc, const char *name);
102
+
103
+ int ots_get_article_word_count (const OtsArticle * Doc);
104
+
105
+
106
+ /*grader*/
107
+ void ots_highlight_doc (OtsArticle * Doc, int percent); /*example: 20%*/
108
+ void ots_highlight_doc_lines (OtsArticle * Doc, int lines); /*example: 10 lines*/
109
+ void ots_highlight_doc_words (OtsArticle * Doc, int words); /*example: 50 words*/
110
+
111
+ void ots_grade_doc (OtsArticle * Doc);
112
+
113
+ void ots_free_OtsWordTF(OtsWordTF *obj); /*todo: put in .h file*/
114
+ OtsWordTF* ots_new_OtsWordTF(const char* word,const double idf);
115
+
116
+
117
+ /*HTML output*/
118
+ void ots_print_HTML (FILE * stream, const OtsArticle * Doc);
119
+ unsigned char *ots_get_doc_HTML (const OtsArticle * Doc, size_t * out_len);
120
+
121
+ /*TEXT output*/
122
+ void ots_print_doc (FILE * stream, const OtsArticle * Doc);
123
+ unsigned char *ots_get_doc_text (const OtsArticle * Doc, size_t * out_len);
124
+
125
+
126
+ /*Plugin writing*/
127
+ unsigned char* ots_get_line_text (const OtsSentence *aLine, gboolean only_if_selected, size_t *out_size);
128
+ gboolean ots_is_line_selected(const OtsSentence *aLine);
129
+
130
+ /*Stemming support*/
131
+ OtsStemRule *new_stem_rule(void);
132
+ void free_stem_rule (OtsStemRule *rule);
133
+ unsigned char * ots_stem_strip (unsigned const char * aWord, const OtsStemRule *rule); /*returns newly allocated string with the root of the word*/
134
+ unsigned char *ots_stem_format (unsigned const char *aWord, const OtsStemRule * rule); /*Remove leading spaces, commas, colons, etc. */
135
+
136
+ /*Relations between texts*/
137
+
138
+ /*Returns the number of topics that two blocks of text share*/
139
+ int ots_text_relations(
140
+ const unsigned char *text1,const unsigned char *lang_code1,
141
+ const unsigned char *text2,const unsigned char *lang_code2,const int topic_num);
142
+
143
+ /*For a given text, return the list of the topics*/
144
+ char* ots_text_topics(const unsigned char *text,const unsigned char *lang_code,int topic_num);
145
+
146
+
147
+ /*For a given text, return the list of the stemmed topics*/
148
+ GList* ots_text_stem_list(const unsigned char *text,const unsigned char *lang_code,int topic_num);
149
+
150
+
151
+ /*Gives a score on the relations between two lists of topics; similar to the inner product*/
152
+ int ots_topic_list_score(const GList *topic_list1,const GList *topic_list2);
153
+
154
+ G_END_DECLS
155
+
156
+
157
+
158
+ #endif /* HAVE_LIBOTS_H */
data/ext/ots.c CHANGED
@@ -1,197 +1,176 @@
1
- #include <ruby.h>
1
+ #include "ots.h"
2
+ #include <sys/types.h>
3
+ #include <dirent.h>
4
+ #include <errno.h>
2
5
 
3
- /* ruby 1.9 only */
4
- #ifdef RUBY_VM
5
- #include <ruby/encoding.h>
6
- #endif
6
+ static VALUE mOTS, cArticle;
7
7
 
8
- #include <stdio.h>
9
- #include <stdlib.h>
10
- #include <string.h>
8
+ static void article_free(OtsArticle *article) {
9
+ if (article)
10
+ ots_free_article(article);
11
+ }
11
12
 
12
- #include <libots-1/ots/libots.h>
13
+ VALUE article_allocate(VALUE klass) {
14
+ OtsArticle *article = ots_new_article();
15
+ return Data_Wrap_Struct(klass, 0, article_free, article);
16
+ }
13
17
 
14
- #define ID_CONST_GET rb_intern("const_get")
15
- #define CONST_GET(scope, constant) (rb_funcall(scope, ID_CONST_GET, 1, rb_str_new2(constant)))
18
+ OtsArticle* article_handle(VALUE self) {
19
+ OtsArticle *article = 0;
20
+ Data_Get_Struct(self, OtsArticle, article);
21
+ if (!article)
22
+ rb_raise(rb_eArgError, "invalid OTS::Article instance");
23
+ return article;
24
+ }
16
25
 
17
- static VALUE rb_cOTS;
18
- static VALUE eLoadError;
19
- static VALUE eRuntimeError;
20
- static VALUE eArgumentError;
26
+ void article_load_dictionary(OtsArticle *article, char *name) {
27
+ if (!ots_load_xml_dictionary(article, name)) {
28
+ rb_raise(rb_eLoadError, "Could not find dictionary file: %s", name);
29
+ }
30
+ }
21
31
 
22
- typedef struct {
23
- gchar *word; /* the word */
24
- gchar *stem; /*stem of the word*/
25
- gint occ; /* how many times have we seen this word in the text? */
26
- } OtsWordEntery;
32
+ VALUE article_initialize(int argc, VALUE *argv, VALUE self) {
33
+ VALUE text, dictionary;
34
+ OtsArticle *article = article_handle(self);
27
35
 
36
+ rb_scan_args(argc, argv, "11", &text, &dictionary);
28
37
 
29
- /* helpers */
38
+ if (TYPE(text) != T_STRING)
39
+ rb_raise(rb_eArgError, "invalid +text+");
30
40
 
31
- OtsArticle* get_article(VALUE self, gboolean error_on_missing) {
32
- VALUE rb_article_object = rb_iv_get(self, "@article");
33
- if (rb_article_object == Qnil) {
34
- if (error_on_missing)
35
- rb_raise(eRuntimeError, "libots document not initialized properly. Did you forget to parse content ?");
41
+ if (NIL_P(dictionary))
42
+ article_load_dictionary(article, "en");
36
43
  else
37
- return NULL;
38
- }
39
- return (OtsArticle *)DATA_PTR(rb_article_object);
40
- }
44
+ article_load_dictionary(article, CSTRING(dictionary));
45
+
46
+ ots_parse_stream(RSTRING_PTR(text), RSTRING_LEN(text), article);
47
+ ots_grade_doc(article);
41
48
 
42
- void rb_ots_free_article(VALUE self) {
43
- OtsArticle *article = DATA_PTR(rb_iv_get(self, "@article"));
44
- ots_free_article(article);
49
+ rb_iv_set(self, "@encoding", (VALUE)rb_enc_get(text));
50
+
51
+ return self;
45
52
  }
46
53
 
47
- VALUE rb_string(char *utf8) {
48
- VALUE str = rb_str_new(utf8, strlen(utf8));
49
54
 
50
- /* ruby 1.9 only - force bytestream to utf8 */
51
- #ifdef RUBY_VM
52
- rb_enc_associate(str, rb_to_encoding(rb_str_new2("UTF-8")));
53
- ENC_CODERANGE_CLEAR(str);
54
- #endif
55
+ VALUE article_summary(OtsArticle *article, rb_encoding *encoding) {
56
+ OtsSentence *sentence;
55
57
 
56
- return str;
57
- }
58
+ GList *line_ptr = article->lines;
59
+ VALUE summary = rb_ary_new();
58
60
 
59
- /* ruby libots methods/wrappers */
61
+ while (line_ptr != NULL) {
62
+ sentence = (OtsSentence *)line_ptr->data;
60
63
 
61
- VALUE rb_ots_init(VALUE self) {
62
- OtsArticle *article = get_article(self, FALSE);
63
- VALUE dict = Qnil;
64
- if (article != NULL) {
65
- dict = rb_iv_get(self, "@dict");
66
- ots_free_article(article);
67
- }
68
- article = ots_new_article();
69
- rb_iv_set(self, "@article", Data_Wrap_Struct(rb_cObject, 0, 0, article));
70
- rb_iv_set(self, "@dict", dict);
71
- return self;
72
- }
64
+ if (sentence->selected) {
65
+ size_t size;
66
+ unsigned char* content = ots_get_line_text(sentence, TRUE, &size);
73
67
 
74
- VALUE rb_ots_load_dictionary(VALUE self, VALUE dict) {
75
- char *dict_cstr = "en";
76
- if (dict != Qnil) dict_cstr = RSTRING_PTR(dict);
68
+ VALUE line = rb_hash_new();
69
+ rb_hash_aset(line, ID2SYM(rb_intern("sentence")), rb_enc_str_new((char *)content, size, encoding));
70
+ rb_hash_aset(line, ID2SYM(rb_intern("score")), LONG2FIX(sentence->score));
71
+ rb_ary_push(summary, line);
77
72
 
78
- OtsArticle *article = get_article(self, FALSE);
79
- if (article == NULL) {
80
- rb_ots_init(self);
81
- article = get_article(self, TRUE);
82
- }
73
+ // reset this so subsequent calls work right.
74
+ sentence->selected = FALSE;
75
+ }
83
76
 
84
- if (!ots_load_xml_dictionary(article, (unsigned const char *)dict_cstr)) {
85
- rb_ots_free_article(self);
86
- rb_raise(eLoadError, "Could not find dictionary file: %s", dict_cstr);
77
+ line_ptr = g_list_next(line_ptr);
87
78
  }
88
79
 
89
- rb_iv_set(self, "@dict", dict);
90
- return Qtrue;
80
+ return summary;
91
81
  }
92
82
 
93
- VALUE rb_ots_parse_string(VALUE self, VALUE string) {
94
- const unsigned char *string_cstr = (const unsigned char *)RSTRING_PTR(string);
95
- size_t string_len = RSTRING_LEN(string);
83
+ VALUE article_summarize(VALUE self, VALUE options) {
84
+ VALUE lines, percent;
85
+ OtsArticle *article = article_handle(self);
96
86
 
97
- rb_ots_init(self);
98
- rb_ots_load_dictionary(self, rb_iv_get(self, "@dict"));
99
- OtsArticle *article = get_article(self, TRUE);
100
- ots_parse_stream(string_cstr, string_len, article);
101
- ots_grade_doc(article);
102
- return Qtrue;
103
- }
87
+ if (TYPE(options) != T_HASH)
88
+ rb_raise(rb_eArgError, "expect an options hash");
104
89
 
105
- VALUE rb_ots_highlight_lines(VALUE self, int lines) {
106
- OtsArticle *article = get_article(self, TRUE);
107
- ots_highlight_doc_lines(article, lines);
108
- return Qtrue;
109
- }
90
+ lines = rb_hash_aref(options, ID2SYM(rb_intern("lines")));
91
+ percent = rb_hash_aref(options, ID2SYM(rb_intern("percent")));
92
+
93
+ if (NIL_P(lines) && NIL_P(percent))
94
+ rb_raise(rb_eArgError, "expect +lines+ or +percent+ to be provided");
110
95
 
111
- VALUE rb_ots_highlight_percent(VALUE self, int percent) {
112
- OtsArticle *article = get_article(self, TRUE);
113
- ots_highlight_doc(article, percent);
114
- return Qtrue;
96
+ if (lines != Qnil)
97
+ ots_highlight_doc_lines(article, NUM2INT(lines));
98
+ else
99
+ ots_highlight_doc(article, NUM2INT(percent));
100
+
101
+ return article_summary(article, (rb_encoding *)rb_iv_get(self, "@encoding"));
115
102
  }
116
103
 
117
- VALUE rb_ots_article_title(VALUE self) {
118
- OtsArticle *article = get_article(self, TRUE);
119
- if (article->title != NULL)
120
- return rb_string(article->title);
121
- else
122
- return Qnil;
104
+ VALUE article_title(VALUE self) {
105
+ OtsArticle *article = article_handle(self);
106
+ return (article->title ? rb_enc_str_new2(article->title, (rb_encoding*)rb_iv_get(self, "@encoding")) : Qnil);
123
107
  }
124
108
 
125
- VALUE rb_ots_article_keywords(VALUE self) {
126
- OtsArticle *article = get_article(self, TRUE);
127
- GList* words = article->ImpWords;
128
- VALUE iwords = rb_ary_new();
129
- while (words != NULL) {
130
- OtsWordEntery *data = (OtsWordEntery *)words->data;
131
- if (data != NULL && strlen(data->word) > 0)
132
- rb_ary_push(iwords, rb_string(data->word));
133
- words = words->next;
134
- }
109
+ typedef struct {
110
+ gchar *word; /* the word */
111
+ gchar *stem; /*stem of the word*/
112
+ gint occ; /* how many times have we seen this word in the text? */
113
+ } OtsWordEntry;
135
114
 
136
- return iwords;
137
- }
138
115
 
139
- VALUE rb_ots_get_highlighted_lines(VALUE self) {
140
- OtsArticle *article = get_article(self, TRUE);
141
- OtsSentence *sentence;
142
- GList *curr_line = article->lines;
143
- VALUE hlt_lines = rb_ary_new();
116
+ VALUE article_keywords(VALUE self) {
117
+ OtsArticle *article = article_handle(self);
118
+ rb_encoding *encoding = (rb_encoding*)rb_iv_get(self, "@encoding");
144
119
 
145
- while (curr_line != NULL) {
146
- sentence = (OtsSentence *)curr_line->data;
147
- if (sentence->selected) {
148
- size_t len;
149
- unsigned char* content = ots_get_line_text(sentence, TRUE, &len);
150
- VALUE hlt_line = rb_hash_new();
151
- rb_hash_aset(hlt_line, ID2SYM(rb_intern("sentence")), rb_string((char *)content));
152
- rb_hash_aset(hlt_line, ID2SYM(rb_intern("score")), LONG2FIX(sentence->score));
153
- rb_ary_push(hlt_lines, hlt_line);
120
+ VALUE words = rb_ary_new();
121
+ GList* word_ptr = article->ImpWords;
122
+
123
+ while (word_ptr) {
124
+ OtsWordEntry *data = (OtsWordEntry *)word_ptr->data;
125
+ if (data && strlen(data->word) > 0)
126
+ rb_ary_push(words, rb_enc_str_new2(data->word, encoding));
127
+ word_ptr = word_ptr->next;
154
128
  }
155
- curr_line = g_list_next(curr_line);
156
- }
157
129
 
158
- return hlt_lines;
130
+ return words;
159
131
  }
160
132
 
161
- VALUE rb_summarize(VALUE self, VALUE options) {
162
-
163
- VALUE lines = rb_hash_aref(options, ID2SYM(rb_intern("lines")));
164
- VALUE percent = rb_hash_aref(options, ID2SYM(rb_intern("percent")));
133
+ VALUE ots_parse(int argc, VALUE *argv, VALUE self) {
134
+ VALUE article = article_allocate(cArticle);
135
+ article_initialize(argc, argv, article);
136
+ return article;
137
+ }
165
138
 
166
- if (lines != Qnil && percent != Qnil) {
167
- rb_ots_free_article(self);
168
- rb_raise(eArgumentError, "Cannot summarize on :lines & :percent, only one is allowed");
169
- }
170
- else if (lines == Qnil && percent == Qnil) {
171
- rb_ots_free_article(self);
172
- rb_raise(eArgumentError, "Need either :lines or :percent to summarize");
173
- }
139
+ VALUE ots_dictionaries(VALUE self) {
140
+ DIR *dir;
141
+ struct dirent *entry;
142
+ VALUE dictionaries = rb_ary_new();
143
+
144
+ if ((dir = opendir(DICTIONARY_DIR))) {
145
+ while ((entry = readdir(dir))) {
146
+ // entry->d_type is not portable.
147
+ if (strstr(entry->d_name, ".xml"))
148
+ rb_ary_push(dictionaries, rb_str_new(entry->d_name, strlen(entry->d_name) - 4));
149
+ }
150
+ }
151
+ else {
152
+ rb_raise(rb_eIOError, "unable to open dictionary directory: %s", strerror(errno));
153
+ }
174
154
 
175
- if (lines != Qnil)
176
- rb_ots_highlight_lines(self, FIX2INT(lines));
177
- else if (percent != Qnil)
178
- rb_ots_highlight_percent(self, FIX2INT(percent));
179
- return rb_ots_get_highlighted_lines(self);
155
+ closedir(dir);
156
+ return dictionaries;
180
157
  }
181
158
 
182
159
  /* init */
183
160
 
184
161
  void Init_ots(void) {
185
- eLoadError = CONST_GET(rb_mKernel, "LoadError");
186
- eRuntimeError = CONST_GET(rb_mKernel, "RuntimeError");
187
- eArgumentError = CONST_GET(rb_mKernel, "ArgumentError");
188
- rb_cOTS = rb_define_class("OTS", rb_cObject);
189
- rb_define_method(rb_cOTS, "load_dictionary", rb_ots_load_dictionary, 1);
190
- rb_define_method(rb_cOTS, "parse", rb_ots_parse_string, 1);
191
- rb_define_method(rb_cOTS, "highlight_lines", rb_ots_highlight_lines, 1);
192
- rb_define_method(rb_cOTS, "highlight_percent", rb_ots_highlight_percent, 1);
193
- rb_define_method(rb_cOTS, "highlighted_content", rb_ots_get_highlighted_lines, 0);
194
- rb_define_method(rb_cOTS, "summarize", rb_summarize, 1);
195
- rb_define_method(rb_cOTS, "title", rb_ots_article_title, 0);
196
- rb_define_method(rb_cOTS, "keywords", rb_ots_article_keywords, 0);
162
+ mOTS = rb_define_module("OTS");
163
+ cArticle = rb_define_class_under(mOTS, "Article", rb_cObject);
164
+
165
+ rb_define_method(cArticle, "initialize", RUBY_METHOD_FUNC(article_initialize), -1);
166
+ rb_define_method(cArticle, "summarize", RUBY_METHOD_FUNC(article_summarize), 1);
167
+ rb_define_method(cArticle, "title", RUBY_METHOD_FUNC(article_title), 0);
168
+ rb_define_method(cArticle, "keywords", RUBY_METHOD_FUNC(article_keywords), 0);
169
+
170
+ rb_define_module_function(mOTS, "parse", RUBY_METHOD_FUNC(ots_parse), -1);
171
+ rb_define_module_function(mOTS, "dictionaries", RUBY_METHOD_FUNC(ots_dictionaries), 0);
172
+
173
+ rb_define_alloc_func(cArticle, article_allocate);
174
+
175
+ rb_define_const(mOTS, "VERSION", rb_str_new2(RUBY_OTS_VERSION));
197
176
  }