ots 0.4.3 → 0.4.4
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +80 -0
- data/dictionaries/bg.xml +101 -0
- data/dictionaries/ca.xml +141 -0
- data/dictionaries/cs.xml +161 -0
- data/dictionaries/cy.xml +118 -0
- data/dictionaries/da.xml +129 -0
- data/dictionaries/de.xml +354 -0
- data/dictionaries/el.xml +80 -0
- data/dictionaries/en.xml +606 -0
- data/dictionaries/eo.xml +171 -0
- data/dictionaries/es.xml +369 -0
- data/dictionaries/et.xml +172 -0
- data/dictionaries/eu.xml +77 -0
- data/dictionaries/fi.xml +105 -0
- data/dictionaries/fr.xml +199 -0
- data/dictionaries/ga.xml +124 -0
- data/dictionaries/gl.xml +290 -0
- data/dictionaries/he.xml +334 -0
- data/dictionaries/hu.xml +280 -0
- data/dictionaries/ia.xml +97 -0
- data/dictionaries/id.xml +75 -0
- data/dictionaries/is.xml +201 -0
- data/dictionaries/it.xml +206 -0
- data/dictionaries/lv.xml +77 -0
- data/dictionaries/mi.xml +76 -0
- data/dictionaries/ms.xml +160 -0
- data/dictionaries/mt.xml +73 -0
- data/dictionaries/nl.xml +245 -0
- data/dictionaries/nn.xml +264 -0
- data/dictionaries/pl.xml +92 -0
- data/dictionaries/pt.xml +365 -0
- data/dictionaries/ro.xml +163 -0
- data/dictionaries/ru.xml +150 -0
- data/dictionaries/sv.xml +255 -0
- data/dictionaries/tl.xml +67 -0
- data/dictionaries/tr.xml +65 -0
- data/dictionaries/uk.xml +98 -0
- data/dictionaries/yi.xml +293 -0
- data/ext/article.c +119 -0
- data/ext/dictionary.c +335 -0
- data/ext/extconf.rb +13 -14
- data/ext/grader-tc.c +185 -0
- data/ext/grader-tc.h +64 -0
- data/ext/grader-tf.c +116 -0
- data/ext/grader.c +85 -0
- data/ext/highlighter.c +128 -0
- data/ext/html.c +131 -0
- data/ext/libots.h +158 -0
- data/ext/ots.c +130 -151
- data/ext/ots.h +15 -0
- data/ext/parser.c +173 -0
- data/ext/relations.c +163 -0
- data/ext/stemmer.c +332 -0
- data/ext/text.c +98 -0
- data/ext/version.h +2 -0
- data/ext/wordlist.c +220 -0
- data/test/helper.rb +3 -0
- data/test/test_article.rb +52 -0
- data/test/test_ots.rb +23 -0
- metadata +122 -38
- data/README +0 -25
- data/VERSION +0 -1
- data/lib/ots.rb +0 -1
- data/test/ots_test.rb +0 -62
data/ext/highlighter.c
ADDED
@@ -0,0 +1,128 @@
|
|
1
|
+
/*
|
2
|
+
* highlighter
|
3
|
+
*
|
4
|
+
* Copyright (C) 2003 Nadav Rotem <nadav256@hotmail.com>
|
5
|
+
*
|
6
|
+
* This program is free software; you can redistribute it and/or modify
|
7
|
+
* it under the terms of the GNU General Public License as published by
|
8
|
+
* the Free Software Foundation; either version 2 of the License, or
|
9
|
+
* (at your option) any later version.
|
10
|
+
*
|
11
|
+
* This program is distributed in the hope that it will be useful,
|
12
|
+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
14
|
+
* GNU Library General Public License for more details.
|
15
|
+
*
|
16
|
+
* You should have received a copy of the GNU General Public License
|
17
|
+
* along with this program; if not, write to the Free Software
|
18
|
+
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
19
|
+
*/
|
20
|
+
|
21
|
+
#include <stdio.h>
|
22
|
+
#include <stdlib.h>
|
23
|
+
#include <string.h>
|
24
|
+
#include "libots.h"
|
25
|
+
|
26
|
+
/*After the grader has graded the article and each
|
27
|
+
sentence has a score the highlighter will select
|
28
|
+
some of the sentences*/
|
29
|
+
|
30
|
+
static int
|
31
|
+
ots_highlight_max_line (OtsArticle * Doc)
|
32
|
+
{
|
33
|
+
GList *li;
|
34
|
+
int max = 0;
|
35
|
+
for (li = (GList *) Doc->lines; li != NULL; li = li->next)
|
36
|
+
{
|
37
|
+
if (0 == (((OtsSentence *) li->data)->selected)) /* if not selected , count me in */
|
38
|
+
max = MAX (((OtsSentence *) li->data)->score, max);
|
39
|
+
|
40
|
+
}
|
41
|
+
|
42
|
+
for (li = (GList *) Doc->lines; li != NULL; li = li->next)
|
43
|
+
{
|
44
|
+
|
45
|
+
if ((((OtsSentence *) li->data)->score == max) && (((OtsSentence *) li->data)->selected == 0)) /* if score==max && not selected before ,select me; */
|
46
|
+
{
|
47
|
+
((OtsSentence *) li->data)->selected = 1;
|
48
|
+
return ((OtsSentence *) li->data)->wc;
|
49
|
+
}
|
50
|
+
}
|
51
|
+
|
52
|
+
return 0;
|
53
|
+
}
|
54
|
+
|
55
|
+
|
56
|
+
/* todo: implement this
|
57
|
+
|
58
|
+
void
|
59
|
+
ots_highlight_doc_wordcount (OtsArticle * Doc, int wordCount)
|
60
|
+
|
61
|
+
void
|
62
|
+
ots_highlight_doc_linecount (OtsArticle * Doc, int wordCount)
|
63
|
+
|
64
|
+
|
65
|
+
|
66
|
+
void
|
67
|
+
ots_highlight_doc_soft (OtsArticle * Doc, int percent) //blur selection by average of nearby sentences, will mark blocks
|
68
|
+
*/
|
69
|
+
|
70
|
+
void
|
71
|
+
ots_highlight_doc (OtsArticle * Doc, int percent)
|
72
|
+
{
|
73
|
+
int i;
|
74
|
+
double ratio;
|
75
|
+
int wordCount;
|
76
|
+
|
77
|
+
if (0 == Doc->lineCount)
|
78
|
+
return;
|
79
|
+
|
80
|
+
if (percent > 100)
|
81
|
+
percent = 100;
|
82
|
+
else if (percent < 0)
|
83
|
+
percent = 0;
|
84
|
+
|
85
|
+
ratio = ((double) (percent)) / (100.0);
|
86
|
+
|
87
|
+
wordCount = ots_get_article_word_count (Doc);
|
88
|
+
|
89
|
+
for (i = 0; i < (ratio * (double) wordCount);)
|
90
|
+
{
|
91
|
+
i += ots_highlight_max_line (Doc);
|
92
|
+
}
|
93
|
+
}
|
94
|
+
|
95
|
+
void
|
96
|
+
ots_highlight_doc_lines (OtsArticle * Doc, int lines)
|
97
|
+
{
|
98
|
+
int i;
|
99
|
+
int lineCount;
|
100
|
+
int tmp;
|
101
|
+
|
102
|
+
if (0 == Doc->lineCount) return;
|
103
|
+
|
104
|
+
lineCount = Doc->lineCount;
|
105
|
+
i=0;
|
106
|
+
while ((i<lines)&&(i<lineCount))
|
107
|
+
{
|
108
|
+
i++;
|
109
|
+
tmp=ots_highlight_max_line (Doc);
|
110
|
+
}
|
111
|
+
|
112
|
+
}
|
113
|
+
|
114
|
+
/*
 * Select sentences, best score first, until the selected sentences hold
 * more than `words` words or the whole article's word count is reached.
 *
 * Doc   - graded article; does nothing when it has no lines.
 * words - word budget; selection continues while the running total is
 *         still less than or equal to it.
 */
void ots_highlight_doc_words (OtsArticle * Doc, int words)
{
  int attempts;                 /* calls made to ots_highlight_max_line */
  int covered = 0;              /* words held by the sentences selected so far */
  int docWordCount;

  if (0 == Doc->lineCount)
    return;

  docWordCount = ots_get_article_word_count (Doc);

  /* Each call selects at most one sentence, so lineCount attempts are
     enough.  The cap stops the loop from spinning forever once the
     helper keeps returning 0. */
  for (attempts = 0;
       attempts < Doc->lineCount && covered < docWordCount && covered <= words;
       attempts++)
    {
      covered += ots_highlight_max_line (Doc);
    }
}
|
data/ext/html.c
ADDED
@@ -0,0 +1,131 @@
|
|
1
|
+
/*
|
2
|
+
* html.c
|
3
|
+
*
|
4
|
+
* Copyright (C) 2003 Nadav Rotem <nadav256@hotmail.com>
|
5
|
+
*
|
6
|
+
* This program is free software; you can redistribute it and/or modify
|
7
|
+
* it under the terms of the GNU General Public License as published by
|
8
|
+
* the Free Software Foundation; either version 2 of the License, or
|
9
|
+
* (at your option) any later version.
|
10
|
+
*
|
11
|
+
* This program is distributed in the hope that it will be useful,
|
12
|
+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
14
|
+
* GNU Library General Public License for more details.
|
15
|
+
*
|
16
|
+
* You should have received a copy of the GNU General Public License
|
17
|
+
* along with this program; if not, write to the Free Software
|
18
|
+
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
19
|
+
*/
|
20
|
+
|
21
|
+
#include <stdio.h>
|
22
|
+
#include <stdlib.h>
|
23
|
+
#include <string.h>
|
24
|
+
#include "libots.h"
|
25
|
+
|
26
|
+
static unsigned char *
|
27
|
+
ots_get_line_HTML (const OtsSentence * aLine, size_t * out_size)
|
28
|
+
{
|
29
|
+
GList *li;
|
30
|
+
GString *text;
|
31
|
+
unsigned char *utf8_data;
|
32
|
+
char *score_str;
|
33
|
+
text = g_string_new (NULL);
|
34
|
+
|
35
|
+
score_str=g_new0(char,32);
|
36
|
+
sprintf(score_str,"<!--(%ld)-->",aLine->score);
|
37
|
+
g_string_append (text,score_str);
|
38
|
+
g_free(score_str);
|
39
|
+
|
40
|
+
if ((aLine->selected))
|
41
|
+
{
|
42
|
+
g_string_append (text,
|
43
|
+
"<FONT COLOR=\"#16569E\"><span style=\'background:yellow;\'>");
|
44
|
+
}
|
45
|
+
else
|
46
|
+
{
|
47
|
+
g_string_append (text, "<FONT COLOR=\"#16569E\"><span>");
|
48
|
+
}
|
49
|
+
|
50
|
+
for (li = (GList *) aLine->words; li != NULL; li = li->next)
|
51
|
+
{
|
52
|
+
if (0 == strcmp ((char *) li->data, "\n"))
|
53
|
+
g_string_append (text, "<br>");
|
54
|
+
else
|
55
|
+
g_string_append (text, (char *) li->data);
|
56
|
+
}
|
57
|
+
g_string_append (text,"</span></FONT>\n");
|
58
|
+
|
59
|
+
if (out_size)
|
60
|
+
*out_size = text->len;
|
61
|
+
|
62
|
+
utf8_data = text->str;
|
63
|
+
g_string_free (text, FALSE);
|
64
|
+
|
65
|
+
return utf8_data;
|
66
|
+
}
|
67
|
+
|
68
|
+
|
69
|
+
#if 0
/* Disabled: write the HTML fragment of a single sentence to `stream`. */
static void
ots_print_line_HTML (FILE * stream, const OtsSentence * aLine)
{
  size_t html_len;
  unsigned char *html = ots_get_line_HTML (aLine, &html_len);

  fwrite (html, 1, html_len, stream);
  g_free (html);
}
#endif
|
81
|
+
|
82
|
+
|
83
|
+
unsigned char *
|
84
|
+
ots_get_doc_HTML (const OtsArticle * Doc, size_t * out_len)
|
85
|
+
{
|
86
|
+
GList *li;
|
87
|
+
GString *text;
|
88
|
+
unsigned char *utf8_data;
|
89
|
+
size_t line_len;
|
90
|
+
|
91
|
+
text = g_string_new (NULL);
|
92
|
+
|
93
|
+
|
94
|
+
g_string_append (text,
|
95
|
+
"<html>\n<head>\n<title>OTS</title>\n<meta charset=\"utf-8\">\n</head>\n<body>\n");
|
96
|
+
g_string_append (text, "<!-- Generated by OpenTextSummarizer -->\n");
|
97
|
+
g_string_append (text, "<!--");
|
98
|
+
g_string_append (text, Doc->title);
|
99
|
+
g_string_append (text, "-->\n");
|
100
|
+
|
101
|
+
|
102
|
+
for (li = (GList *) Doc->lines; li != NULL; li = li->next)
|
103
|
+
{
|
104
|
+
utf8_data = ots_get_line_HTML ((OtsSentence *) li->data, &line_len);
|
105
|
+
g_string_append_len (text, utf8_data, line_len);
|
106
|
+
g_free (utf8_data);
|
107
|
+
}
|
108
|
+
g_string_append (text, "</body></html>\n");
|
109
|
+
|
110
|
+
if (out_len)
|
111
|
+
*out_len = text->len;
|
112
|
+
utf8_data = text->str;
|
113
|
+
|
114
|
+
g_string_free (text, FALSE);
|
115
|
+
return utf8_data;
|
116
|
+
|
117
|
+
}
|
118
|
+
|
119
|
+
|
120
|
+
|
121
|
+
void
|
122
|
+
ots_print_HTML (FILE * stream, const OtsArticle * Doc)
|
123
|
+
{
|
124
|
+
unsigned char *utf8_txt;
|
125
|
+
size_t len;
|
126
|
+
|
127
|
+
utf8_txt = ots_get_doc_HTML (Doc, &len);
|
128
|
+
fwrite (utf8_txt, 1, len, stream);
|
129
|
+
g_free (utf8_txt);
|
130
|
+
|
131
|
+
}
|
data/ext/libots.h
ADDED
@@ -0,0 +1,158 @@
|
|
1
|
+
/*
|
2
|
+
* libots.h
|
3
|
+
*
|
4
|
+
* Copyright (C) 2003 Nadav Rotem <nadav256@hotmail.com>
|
5
|
+
*
|
6
|
+
* This program is free software; you can redistribute it and/or modify
|
7
|
+
* it under the terms of the GNU General Public License as published by
|
8
|
+
* the Free Software Foundation; either version 2 of the License, or
|
9
|
+
* (at your option) any later version.
|
10
|
+
*
|
11
|
+
* This program is distributed in the hope that it will be useful,
|
12
|
+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
14
|
+
* GNU Library General Public License for more details.
|
15
|
+
*
|
16
|
+
* You should have received a copy of the GNU General Public License
|
17
|
+
* along with this program; if not, write to the Free Software
|
18
|
+
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
19
|
+
*/
|
20
|
+
|
21
|
+
#ifndef HAVE_LIBOTS_H
|
22
|
+
#define HAVE_LIBOTS_H
|
23
|
+
|
24
|
+
#include <glib.h>
|
25
|
+
|
26
|
+
G_BEGIN_DECLS
|
27
|
+
|
28
|
+
/* Term Frequency record: a word paired with a numeric weight. */
typedef struct
{
  char *word;                   /* the term */
  double tf;                    /* term frequency; presumably also holds the IDF
                                   value (ots_new_OtsWordTF takes an `idf`
                                   argument) - TODO confirm */
} OtsWordTF;
|
33
|
+
|
34
|
+
|
35
|
+
typedef struct
|
36
|
+
{
|
37
|
+
/*a GList of char* */
|
38
|
+
GList *RemovePre; /* (a|b) replace string a with b */
|
39
|
+
GList *RemovePost;
|
40
|
+
GList *step1_pre;
|
41
|
+
GList *step1_post;
|
42
|
+
|
43
|
+
GList *synonyms;
|
44
|
+
GList *manual;
|
45
|
+
|
46
|
+
GList *ParserBreak;
|
47
|
+
GList *ParserDontBreak;
|
48
|
+
|
49
|
+
|
50
|
+
/*to be implemented*/
|
51
|
+
GList *ReplaceChars;
|
52
|
+
|
53
|
+
} OtsStemRule;
|
54
|
+
|
55
|
+
|
56
|
+
typedef struct
|
57
|
+
{
|
58
|
+
GList *words; /* a Glist of words (char*) */
|
59
|
+
glong score; /*score set by the grader*/
|
60
|
+
gboolean selected; /*is selected?*/
|
61
|
+
gint wc; /*word count*/
|
62
|
+
void *user_data; /*pointer to the original sentence , or serial number maybe*/
|
63
|
+
} OtsSentence;
|
64
|
+
|
65
|
+
|
66
|
+
typedef struct
|
67
|
+
{
|
68
|
+
GList *lines; /* a Glist of sentences (struct Sentence) */
|
69
|
+
gint lineCount; /*lines in the text*/
|
70
|
+
char *title; /*title , auto generated*/
|
71
|
+
|
72
|
+
OtsStemRule *stem; /*stemming & parsing rules*/
|
73
|
+
|
74
|
+
/*Term Frequency grader*/
|
75
|
+
GList *tf_terms;
|
76
|
+
GList *idf_terms;
|
77
|
+
|
78
|
+
|
79
|
+
/*Term Count grader*/
|
80
|
+
GList *dict; /* dictionary from xml*/
|
81
|
+
GList *wordStat; /* a wordlist of all words in the article and their occ */
|
82
|
+
GList *ImpWords; /*important words - for term count grader*/
|
83
|
+
|
84
|
+
|
85
|
+
} OtsArticle;
|
86
|
+
|
87
|
+
|
88
|
+
OtsArticle *ots_new_article (void);
|
89
|
+
void ots_free_article (OtsArticle *art);
|
90
|
+
|
91
|
+
/*parser*/
|
92
|
+
void ots_parse_file (FILE * stream, OtsArticle * Doc); /*file input */
|
93
|
+
void ots_parse_stream(const unsigned char *utf8 , size_t len ,OtsArticle *Doc); /*parse unicode stream*/
|
94
|
+
|
95
|
+
OtsSentence *ots_append_line (OtsArticle * Doc);
|
96
|
+
void ots_append_word (OtsSentence * aLine,unsigned const char *aWord);
|
97
|
+
void ots_add_wordstat (OtsArticle * Doc,unsigned const char *wordString);
|
98
|
+
|
99
|
+
|
100
|
+
/*dictionary*/
|
101
|
+
gboolean ots_load_xml_dictionary (OtsArticle * Doc, const char *name);
|
102
|
+
|
103
|
+
int ots_get_article_word_count (const OtsArticle * Doc);
|
104
|
+
|
105
|
+
|
106
|
+
/*grader*/
|
107
|
+
void ots_highlight_doc (OtsArticle * Doc, int percent); /*example: 20%*/
|
108
|
+
void ots_highlight_doc_lines (OtsArticle * Doc, int lines); /*example: 10 lines*/
|
109
|
+
void ots_highlight_doc_words (OtsArticle * Doc, int words); /*example: 50 words*/
|
110
|
+
|
111
|
+
void ots_grade_doc (OtsArticle * Doc);
|
112
|
+
|
113
|
+
void ots_free_OtsWordTF(OtsWordTF *obj); /*todo: put in .h file*/
|
114
|
+
OtsWordTF* ots_new_OtsWordTF(const char* word,const double idf);
|
115
|
+
|
116
|
+
|
117
|
+
/*HTML output*/
|
118
|
+
void ots_print_HTML (FILE * stream, const OtsArticle * Doc);
|
119
|
+
unsigned char *ots_get_doc_HTML (const OtsArticle * Doc, size_t * out_len);
|
120
|
+
|
121
|
+
/*TEXT output*/
|
122
|
+
void ots_print_doc (FILE * stream, const OtsArticle * Doc);
|
123
|
+
unsigned char *ots_get_doc_text (const OtsArticle * Doc, size_t * out_len);
|
124
|
+
|
125
|
+
|
126
|
+
/*Plugin writing*/
|
127
|
+
unsigned char* ots_get_line_text (const OtsSentence *aLine, gboolean only_if_selected, size_t *out_size);
|
128
|
+
gboolean ots_is_line_selected(const OtsSentence *aLine);
|
129
|
+
|
130
|
+
/*Stemm support*/
|
131
|
+
OtsStemRule *new_stem_rule(void);
|
132
|
+
void free_stem_rule (OtsStemRule *rule);
|
133
|
+
unsigned char * ots_stem_strip (unsigned const char * aWord, const OtsStemRule *rule); /*returns newly allocated string with the root of the word*/
|
134
|
+
unsigned char *ots_stem_format (unsigned const char *aWord, const OtsStemRule * rule); /*Remove leading spaces, comas, colons, etc. */
|
135
|
+
|
136
|
+
/*Relations between texts*/
|
137
|
+
|
138
|
+
/*Returns the number of topics that two blocks of text share*/
|
139
|
+
int ots_text_relations(
|
140
|
+
const unsigned char *text1,const unsigned char *lang_code1,
|
141
|
+
const unsigned char *text2,const unsigned char *lang_code2,const int topic_num);
|
142
|
+
|
143
|
+
/*For a given text, return the list of the topics*/
|
144
|
+
char* ots_text_topics(const unsigned char *text,const unsigned char *lang_code,int topic_num);
|
145
|
+
|
146
|
+
|
147
|
+
/*For a given text, return the list of the stemmed topics*/
|
148
|
+
GList* ots_text_stem_list(const unsigned char *text,const unsigned char *lang_code,int topic_num);
|
149
|
+
|
150
|
+
|
151
|
+
/*Gives a score on the relations between two lists of topics; simmilar to the inner product*/
|
152
|
+
int ots_topic_list_score(const GList *topic_list1,const GList *topic_list2);
|
153
|
+
|
154
|
+
G_END_DECLS
|
155
|
+
|
156
|
+
|
157
|
+
|
158
|
+
#endif /* HAVE_LIBOTS_H */
|
data/ext/ots.c
CHANGED
@@ -1,197 +1,176 @@
|
|
1
|
-
#include
|
1
|
+
#include "ots.h"
|
2
|
+
#include <sys/types.h>
|
3
|
+
#include <dirent.h>
|
4
|
+
#include <errno.h>
|
2
5
|
|
3
|
-
|
4
|
-
#ifdef RUBY_VM
|
5
|
-
#include <ruby/encoding.h>
|
6
|
-
#endif
|
6
|
+
static VALUE mOTS, cArticle;
|
7
7
|
|
8
|
-
|
9
|
-
|
10
|
-
|
8
|
+
/* Free callback registered with Data_Wrap_Struct: releases the wrapped article, if any. */
static void article_free(OtsArticle *article) {
  if (!article)
    return;

  ots_free_article(article);
}
|
11
12
|
|
12
|
-
|
13
|
+
/* Allocator for OTS::Article: wraps a fresh OtsArticle, freed through article_free. */
VALUE article_allocate(VALUE klass) {
  OtsArticle *fresh = ots_new_article();

  return Data_Wrap_Struct(klass, 0, article_free, fresh);
}
|
13
17
|
|
14
|
-
|
15
|
-
|
18
|
+
/*
 * Unwrap the OtsArticle stored in `self`.
 * Raises ArgumentError when the wrapped pointer is NULL.
 */
OtsArticle* article_handle(VALUE self) {
  OtsArticle *wrapped = NULL;

  Data_Get_Struct(self, OtsArticle, wrapped);
  if (wrapped)
    return wrapped;

  rb_raise(rb_eArgError, "invalid OTS::Article instance");
  return wrapped;
}
|
16
25
|
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
26
|
+
/*
 * Load the XML dictionary `name` into `article`.
 * Raises LoadError when ots_load_xml_dictionary reports failure.
 */
void article_load_dictionary(OtsArticle *article, char *name) {
  gboolean loaded = ots_load_xml_dictionary(article, name);

  if (!loaded)
    rb_raise(rb_eLoadError, "Could not find dictionary file: %s", name);
}
|
21
31
|
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
gint occ; /* how many times have we seen this word in the text? */
|
26
|
-
} OtsWordEntery;
|
32
|
+
/*
 * OTS::Article#initialize(text, dictionary = nil)
 *
 * Loads the dictionary ("en" when none is given), parses and grades
 * `text`, and remembers the encoding of `text` in @encoding.
 * Raises ArgumentError when `text` is not a String.
 */
VALUE article_initialize(int argc, VALUE *argv, VALUE self) {
  VALUE text, dictionary;
  OtsArticle *article = article_handle(self);

  rb_scan_args(argc, argv, "11", &text, &dictionary);

  if (TYPE(text) != T_STRING) {
    rb_raise(rb_eArgError, "invalid +text+");
  }

  if (!NIL_P(dictionary)) {
    article_load_dictionary(article, CSTRING(dictionary));
  }
  else {
    article_load_dictionary(article, "en");
  }

  ots_parse_stream((const unsigned char *)RSTRING_PTR(text), RSTRING_LEN(text), article);
  ots_grade_doc(article);

  /* NOTE(review): a raw rb_encoding pointer is stored as a VALUE here and
     cast back by the readers; this looks unsafe with respect to the GC
     (rb_enc_from_encoding / rb_to_encoding is the usual round trip) -
     confirm before changing, all readers must change together. */
  rb_iv_set(self, "@encoding", (VALUE)rb_enc_get(text));

  return self;
}
|
46
53
|
|
47
|
-
VALUE rb_string(char *utf8) {
|
48
|
-
VALUE str = rb_str_new(utf8, strlen(utf8));
|
49
54
|
|
50
|
-
|
51
|
-
|
52
|
-
rb_enc_associate(str, rb_to_encoding(rb_str_new2("UTF-8")));
|
53
|
-
ENC_CODERANGE_CLEAR(str);
|
54
|
-
#endif
|
55
|
+
/*
 * Build the summary array from the sentences currently selected.
 *
 * Every selected sentence becomes a hash { :sentence => String, :score =>
 * Integer }; its `selected` flag is then cleared so that a later
 * summarize call starts from a clean state.
 *
 * article  - graded article with some sentences selected.
 * encoding - encoding applied to the sentence strings.
 *
 * Returns a Ruby Array (empty when nothing is selected).
 */
VALUE article_summary(OtsArticle *article, rb_encoding *encoding) {
  GList *line_ptr;
  VALUE summary = rb_ary_new();

  for (line_ptr = article->lines; line_ptr != NULL; line_ptr = g_list_next(line_ptr)) {
    OtsSentence *sentence = (OtsSentence *)line_ptr->data;
    size_t size;
    unsigned char *content;
    VALUE text, line;

    if (!sentence->selected)
      continue;

    content = ots_get_line_text(sentence, TRUE, &size);
    text    = rb_enc_str_new((char *)content, size, encoding);

    /* The Ruby string holds its own copy of the bytes. The libots buffer
       is assumed to be caller-owned, as with ots_get_doc_HTML, and was
       never released before - TODO confirm against ots_get_line_text. */
    g_free(content);

    line = rb_hash_new();
    rb_hash_aset(line, ID2SYM(rb_intern("sentence")), text);
    rb_hash_aset(line, ID2SYM(rb_intern("score")), LONG2FIX(sentence->score));
    rb_ary_push(summary, line);

    // reset this so subsequent calls work right.
    sentence->selected = FALSE;
  }

  return summary;
}
|
92
82
|
|
93
|
-
VALUE
|
94
|
-
|
95
|
-
|
83
|
+
/*
 * OTS::Article#summarize(options)
 *
 * `options` must be a Hash providing :lines or :percent; :lines wins
 * when both are present.  Raises ArgumentError otherwise.
 * Returns the summary array built by article_summary.
 */
VALUE article_summarize(VALUE self, VALUE options) {
  VALUE lines, percent;
  OtsArticle *article = article_handle(self);

  if (TYPE(options) != T_HASH) {
    rb_raise(rb_eArgError, "expect an options hash");
  }

  lines   = rb_hash_aref(options, ID2SYM(rb_intern("lines")));
  percent = rb_hash_aref(options, ID2SYM(rb_intern("percent")));

  if (NIL_P(lines) && NIL_P(percent)) {
    rb_raise(rb_eArgError, "expect +lines+ or +percent+ to be provided");
  }

  if (NIL_P(lines)) {
    ots_highlight_doc(article, NUM2INT(percent));
  }
  else {
    ots_highlight_doc_lines(article, NUM2INT(lines));
  }

  return article_summary(article, (rb_encoding *)rb_iv_get(self, "@encoding"));
}
|
116
103
|
|
117
|
-
VALUE
|
118
|
-
|
119
|
-
|
120
|
-
return rb_string(article->title);
|
121
|
-
else
|
122
|
-
return Qnil;
|
104
|
+
/* OTS::Article#title: the article title as a String in @encoding, or nil when unset. */
VALUE article_title(VALUE self) {
  OtsArticle *article = article_handle(self);

  if (!article->title)
    return Qnil;

  return rb_enc_str_new2(article->title, (rb_encoding*)rb_iv_get(self, "@encoding"));
}
|
124
108
|
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
OtsWordEntery *data = (OtsWordEntery *)words->data;
|
131
|
-
if (data != NULL && strlen(data->word) > 0)
|
132
|
-
rb_ary_push(iwords, rb_string(data->word));
|
133
|
-
words = words->next;
|
134
|
-
}
|
109
|
+
typedef struct {
|
110
|
+
gchar *word; /* the word */
|
111
|
+
gchar *stem; /*stem of the word*/
|
112
|
+
gint occ; /* how many times have we seen this word in the text? */
|
113
|
+
} OtsWordEntry;
|
135
114
|
|
136
|
-
return iwords;
|
137
|
-
}
|
138
115
|
|
139
|
-
VALUE
|
140
|
-
|
141
|
-
|
142
|
-
GList *curr_line = article->lines;
|
143
|
-
VALUE hlt_lines = rb_ary_new();
|
116
|
+
/*
 * OTS::Article#keywords
 *
 * Returns an Array with the non-empty words found in article->ImpWords,
 * each as a String in @encoding.  Entries without a word are skipped.
 */
VALUE article_keywords(VALUE self) {
  OtsArticle *article   = article_handle(self);
  rb_encoding *encoding = (rb_encoding*)rb_iv_get(self, "@encoding");
  VALUE words           = rb_ary_new();
  GList *word_ptr;

  for (word_ptr = article->ImpWords; word_ptr; word_ptr = word_ptr->next) {
    OtsWordEntry *data = (OtsWordEntry *)word_ptr->data;

    /* guard the word pointer too: strlen(NULL) is undefined behaviour */
    if (data && data->word && data->word[0] != '\0')
      rb_ary_push(words, rb_enc_str_new2(data->word, encoding));
  }

  return words;
}
|
160
132
|
|
161
|
-
VALUE
|
162
|
-
|
163
|
-
|
164
|
-
|
133
|
+
/* OTS.parse(text, dictionary = nil): allocates an OTS::Article and initializes it with the given arguments. */
VALUE ots_parse(int argc, VALUE *argv, VALUE self) {
  VALUE instance = article_allocate(cArticle);

  article_initialize(argc, argv, instance);

  return instance;
}
|
165
138
|
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
139
|
+
/*
 * OTS.dictionaries
 *
 * Lists the dictionaries found in DICTIONARY_DIR: every directory entry
 * whose name ends in ".xml", with that suffix removed.
 * Raises IOError when the directory cannot be opened.
 */
VALUE ots_dictionaries(VALUE self) {
  static const char suffix[] = ".xml";
  const size_t suffix_len    = sizeof(suffix) - 1;
  DIR *dir;
  struct dirent *entry;
  VALUE dictionaries = rb_ary_new();

  dir = opendir(DICTIONARY_DIR);
  if (!dir) {
    rb_raise(rb_eIOError, "unable to open dictionary directory: %s", strerror(errno));
  }

  while ((entry = readdir(dir))) {
    // entry->d_type is not portable.
    size_t name_len = strlen(entry->d_name);

    /* Require a real ".xml" suffix after a non-empty base name; a
       substring match would also accept names such as "en.xml.bak". */
    if (name_len > suffix_len && strcmp(entry->d_name + name_len - suffix_len, suffix) == 0)
      rb_ary_push(dictionaries, rb_str_new(entry->d_name, (long)(name_len - suffix_len)));
  }

  closedir(dir);
  return dictionaries;
}
|
181
158
|
|
182
159
|
/* init */
|
183
160
|
|
184
161
|
void Init_ots(void) {
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
rb_define_method(
|
190
|
-
rb_define_method(
|
191
|
-
rb_define_method(
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
162
|
+
mOTS = rb_define_module("OTS");
|
163
|
+
cArticle = rb_define_class_under(mOTS, "Article", rb_cObject);
|
164
|
+
|
165
|
+
rb_define_method(cArticle, "initialize", RUBY_METHOD_FUNC(article_initialize), -1);
|
166
|
+
rb_define_method(cArticle, "summarize", RUBY_METHOD_FUNC(article_summarize), 1);
|
167
|
+
rb_define_method(cArticle, "title", RUBY_METHOD_FUNC(article_title), 0);
|
168
|
+
rb_define_method(cArticle, "keywords", RUBY_METHOD_FUNC(article_keywords), 0);
|
169
|
+
|
170
|
+
rb_define_module_function(mOTS, "parse", RUBY_METHOD_FUNC(ots_parse), -1);
|
171
|
+
rb_define_module_function(mOTS, "dictionaries", RUBY_METHOD_FUNC(ots_dictionaries), 0);
|
172
|
+
|
173
|
+
rb_define_alloc_func(cArticle, article_allocate);
|
174
|
+
|
175
|
+
rb_define_const(mOTS, "VERSION", rb_str_new2(RUBY_OTS_VERSION));
|
197
176
|
}
|