ots 0.4.3 → 0.4.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +80 -0
- data/dictionaries/bg.xml +101 -0
- data/dictionaries/ca.xml +141 -0
- data/dictionaries/cs.xml +161 -0
- data/dictionaries/cy.xml +118 -0
- data/dictionaries/da.xml +129 -0
- data/dictionaries/de.xml +354 -0
- data/dictionaries/el.xml +80 -0
- data/dictionaries/en.xml +606 -0
- data/dictionaries/eo.xml +171 -0
- data/dictionaries/es.xml +369 -0
- data/dictionaries/et.xml +172 -0
- data/dictionaries/eu.xml +77 -0
- data/dictionaries/fi.xml +105 -0
- data/dictionaries/fr.xml +199 -0
- data/dictionaries/ga.xml +124 -0
- data/dictionaries/gl.xml +290 -0
- data/dictionaries/he.xml +334 -0
- data/dictionaries/hu.xml +280 -0
- data/dictionaries/ia.xml +97 -0
- data/dictionaries/id.xml +75 -0
- data/dictionaries/is.xml +201 -0
- data/dictionaries/it.xml +206 -0
- data/dictionaries/lv.xml +77 -0
- data/dictionaries/mi.xml +76 -0
- data/dictionaries/ms.xml +160 -0
- data/dictionaries/mt.xml +73 -0
- data/dictionaries/nl.xml +245 -0
- data/dictionaries/nn.xml +264 -0
- data/dictionaries/pl.xml +92 -0
- data/dictionaries/pt.xml +365 -0
- data/dictionaries/ro.xml +163 -0
- data/dictionaries/ru.xml +150 -0
- data/dictionaries/sv.xml +255 -0
- data/dictionaries/tl.xml +67 -0
- data/dictionaries/tr.xml +65 -0
- data/dictionaries/uk.xml +98 -0
- data/dictionaries/yi.xml +293 -0
- data/ext/article.c +119 -0
- data/ext/dictionary.c +335 -0
- data/ext/extconf.rb +13 -14
- data/ext/grader-tc.c +185 -0
- data/ext/grader-tc.h +64 -0
- data/ext/grader-tf.c +116 -0
- data/ext/grader.c +85 -0
- data/ext/highlighter.c +128 -0
- data/ext/html.c +131 -0
- data/ext/libots.h +158 -0
- data/ext/ots.c +130 -151
- data/ext/ots.h +15 -0
- data/ext/parser.c +173 -0
- data/ext/relations.c +163 -0
- data/ext/stemmer.c +332 -0
- data/ext/text.c +98 -0
- data/ext/version.h +2 -0
- data/ext/wordlist.c +220 -0
- data/test/helper.rb +3 -0
- data/test/test_article.rb +52 -0
- data/test/test_ots.rb +23 -0
- metadata +122 -38
- data/README +0 -25
- data/VERSION +0 -1
- data/lib/ots.rb +0 -1
- data/test/ots_test.rb +0 -62
data/ext/highlighter.c
ADDED
@@ -0,0 +1,128 @@
|
|
1
|
+
/*
|
2
|
+
* highlighter
|
3
|
+
*
|
4
|
+
* Copyright (C) 2003 Nadav Rotem <nadav256@hotmail.com>
|
5
|
+
*
|
6
|
+
* This program is free software; you can redistribute it and/or modify
|
7
|
+
* it under the terms of the GNU General Public License as published by
|
8
|
+
* the Free Software Foundation; either version 2 of the License, or
|
9
|
+
* (at your option) any later version.
|
10
|
+
*
|
11
|
+
* This program is distributed in the hope that it will be useful,
|
12
|
+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
14
|
+
* GNU Library General Public License for more details.
|
15
|
+
*
|
16
|
+
* You should have received a copy of the GNU General Public License
|
17
|
+
* along with this program; if not, write to the Free Software
|
18
|
+
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
19
|
+
*/
|
20
|
+
|
21
|
+
#include <stdio.h>
|
22
|
+
#include <stdlib.h>
|
23
|
+
#include <string.h>
|
24
|
+
#include "libots.h"
|
25
|
+
|
26
|
+
/*After the grader has graded the article and each
|
27
|
+
sentence has a score the highlighter will select
|
28
|
+
some of the sentences*/
|
29
|
+
|
30
|
+
static int
|
31
|
+
ots_highlight_max_line (OtsArticle * Doc)
|
32
|
+
{
|
33
|
+
GList *li;
|
34
|
+
int max = 0;
|
35
|
+
for (li = (GList *) Doc->lines; li != NULL; li = li->next)
|
36
|
+
{
|
37
|
+
if (0 == (((OtsSentence *) li->data)->selected)) /* if not selected , count me in */
|
38
|
+
max = MAX (((OtsSentence *) li->data)->score, max);
|
39
|
+
|
40
|
+
}
|
41
|
+
|
42
|
+
for (li = (GList *) Doc->lines; li != NULL; li = li->next)
|
43
|
+
{
|
44
|
+
|
45
|
+
if ((((OtsSentence *) li->data)->score == max) && (((OtsSentence *) li->data)->selected == 0)) /* if score==max && not selected before ,select me; */
|
46
|
+
{
|
47
|
+
((OtsSentence *) li->data)->selected = 1;
|
48
|
+
return ((OtsSentence *) li->data)->wc;
|
49
|
+
}
|
50
|
+
}
|
51
|
+
|
52
|
+
return 0;
|
53
|
+
}
|
54
|
+
|
55
|
+
|
56
|
+
/* todo: implement this
|
57
|
+
|
58
|
+
void
|
59
|
+
ots_highlight_doc_wordcount (OtsArticle * Doc, int wordCount)
|
60
|
+
|
61
|
+
void
|
62
|
+
ots_highlight_doc_linecount (OtsArticle * Doc, int wordCount)
|
63
|
+
|
64
|
+
|
65
|
+
|
66
|
+
void
|
67
|
+
ots_highlight_doc_soft (OtsArticle * Doc, int percent) //blur selection by average of nearby sentences, will mark blocks
|
68
|
+
*/
|
69
|
+
|
70
|
+
void
|
71
|
+
ots_highlight_doc (OtsArticle * Doc, int percent)
|
72
|
+
{
|
73
|
+
int i;
|
74
|
+
double ratio;
|
75
|
+
int wordCount;
|
76
|
+
|
77
|
+
if (0 == Doc->lineCount)
|
78
|
+
return;
|
79
|
+
|
80
|
+
if (percent > 100)
|
81
|
+
percent = 100;
|
82
|
+
else if (percent < 0)
|
83
|
+
percent = 0;
|
84
|
+
|
85
|
+
ratio = ((double) (percent)) / (100.0);
|
86
|
+
|
87
|
+
wordCount = ots_get_article_word_count (Doc);
|
88
|
+
|
89
|
+
for (i = 0; i < (ratio * (double) wordCount);)
|
90
|
+
{
|
91
|
+
i += ots_highlight_max_line (Doc);
|
92
|
+
}
|
93
|
+
}
|
94
|
+
|
95
|
+
void
|
96
|
+
ots_highlight_doc_lines (OtsArticle * Doc, int lines)
|
97
|
+
{
|
98
|
+
int i;
|
99
|
+
int lineCount;
|
100
|
+
int tmp;
|
101
|
+
|
102
|
+
if (0 == Doc->lineCount) return;
|
103
|
+
|
104
|
+
lineCount = Doc->lineCount;
|
105
|
+
i=0;
|
106
|
+
while ((i<lines)&&(i<lineCount))
|
107
|
+
{
|
108
|
+
i++;
|
109
|
+
tmp=ots_highlight_max_line (Doc);
|
110
|
+
}
|
111
|
+
|
112
|
+
}
|
113
|
+
|
114
|
+
/*
 * Highlight sentences, best score first, until about `words` words are
 * selected.
 *
 * Doc    article whose sentences already carry a score
 * words  requested size of the selection, in words
 *
 * Selection stops once the accumulated word count exceeds `words` or reaches
 * the article's total word count.  Does nothing for an article without lines.
 */
void ots_highlight_doc_words (OtsArticle * Doc, int words)
{
  guint attempts;
  int picked_words = 0;
  int docWordCount;

  if (0 == Doc->lineCount) return;

  docWordCount = ots_get_article_word_count (Doc);

  /* Each call to ots_highlight_max_line() selects at most one sentence, so
     one attempt per sentence is always enough; the bound keeps the loop from
     spinning forever when the helper returns 0 with nothing left to select.

     NOTE(review): the `<=` is kept from the original, so one more sentence is
     selected when the running total equals `words` exactly (and words == 0
     still selects one sentence) -- confirm whether `<` was intended. */
  for (attempts = g_list_length ((GList *) Doc->lines);
       attempts > 0 && picked_words < docWordCount && picked_words <= words;
       attempts--)
    {
      picked_words += ots_highlight_max_line (Doc);
    }
}
|
data/ext/html.c
ADDED
@@ -0,0 +1,131 @@
|
|
1
|
+
/*
|
2
|
+
* html.c
|
3
|
+
*
|
4
|
+
* Copyright (C) 2003 Nadav Rotem <nadav256@hotmail.com>
|
5
|
+
*
|
6
|
+
* This program is free software; you can redistribute it and/or modify
|
7
|
+
* it under the terms of the GNU General Public License as published by
|
8
|
+
* the Free Software Foundation; either version 2 of the License, or
|
9
|
+
* (at your option) any later version.
|
10
|
+
*
|
11
|
+
* This program is distributed in the hope that it will be useful,
|
12
|
+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
14
|
+
* GNU Library General Public License for more details.
|
15
|
+
*
|
16
|
+
* You should have received a copy of the GNU General Public License
|
17
|
+
* along with this program; if not, write to the Free Software
|
18
|
+
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
19
|
+
*/
|
20
|
+
|
21
|
+
#include <stdio.h>
|
22
|
+
#include <stdlib.h>
|
23
|
+
#include <string.h>
|
24
|
+
#include "libots.h"
|
25
|
+
|
26
|
+
static unsigned char *
|
27
|
+
ots_get_line_HTML (const OtsSentence * aLine, size_t * out_size)
|
28
|
+
{
|
29
|
+
GList *li;
|
30
|
+
GString *text;
|
31
|
+
unsigned char *utf8_data;
|
32
|
+
char *score_str;
|
33
|
+
text = g_string_new (NULL);
|
34
|
+
|
35
|
+
score_str=g_new0(char,32);
|
36
|
+
sprintf(score_str,"<!--(%ld)-->",aLine->score);
|
37
|
+
g_string_append (text,score_str);
|
38
|
+
g_free(score_str);
|
39
|
+
|
40
|
+
if ((aLine->selected))
|
41
|
+
{
|
42
|
+
g_string_append (text,
|
43
|
+
"<FONT COLOR=\"#16569E\"><span style=\'background:yellow;\'>");
|
44
|
+
}
|
45
|
+
else
|
46
|
+
{
|
47
|
+
g_string_append (text, "<FONT COLOR=\"#16569E\"><span>");
|
48
|
+
}
|
49
|
+
|
50
|
+
for (li = (GList *) aLine->words; li != NULL; li = li->next)
|
51
|
+
{
|
52
|
+
if (0 == strcmp ((char *) li->data, "\n"))
|
53
|
+
g_string_append (text, "<br>");
|
54
|
+
else
|
55
|
+
g_string_append (text, (char *) li->data);
|
56
|
+
}
|
57
|
+
g_string_append (text,"</span></FONT>\n");
|
58
|
+
|
59
|
+
if (out_size)
|
60
|
+
*out_size = text->len;
|
61
|
+
|
62
|
+
utf8_data = text->str;
|
63
|
+
g_string_free (text, FALSE);
|
64
|
+
|
65
|
+
return utf8_data;
|
66
|
+
}
|
67
|
+
|
68
|
+
|
69
|
+
#if 0
/* Disabled helper: writes the HTML fragment of a single sentence to `stream`. */
static void
ots_print_line_HTML (FILE * stream, const OtsSentence * aLine)
{
  size_t html_len;
  unsigned char *html = ots_get_line_HTML (aLine, &html_len);

  fwrite (html, 1, html_len, stream);
  g_free (html);
}
#endif
|
81
|
+
|
82
|
+
|
83
|
+
unsigned char *
|
84
|
+
ots_get_doc_HTML (const OtsArticle * Doc, size_t * out_len)
|
85
|
+
{
|
86
|
+
GList *li;
|
87
|
+
GString *text;
|
88
|
+
unsigned char *utf8_data;
|
89
|
+
size_t line_len;
|
90
|
+
|
91
|
+
text = g_string_new (NULL);
|
92
|
+
|
93
|
+
|
94
|
+
g_string_append (text,
|
95
|
+
"<html>\n<head>\n<title>OTS</title>\n<meta charset=\"utf-8\">\n</head>\n<body>\n");
|
96
|
+
g_string_append (text, "<!-- Generated by OpenTextSummarizer -->\n");
|
97
|
+
g_string_append (text, "<!--");
|
98
|
+
g_string_append (text, Doc->title);
|
99
|
+
g_string_append (text, "-->\n");
|
100
|
+
|
101
|
+
|
102
|
+
for (li = (GList *) Doc->lines; li != NULL; li = li->next)
|
103
|
+
{
|
104
|
+
utf8_data = ots_get_line_HTML ((OtsSentence *) li->data, &line_len);
|
105
|
+
g_string_append_len (text, utf8_data, line_len);
|
106
|
+
g_free (utf8_data);
|
107
|
+
}
|
108
|
+
g_string_append (text, "</body></html>\n");
|
109
|
+
|
110
|
+
if (out_len)
|
111
|
+
*out_len = text->len;
|
112
|
+
utf8_data = text->str;
|
113
|
+
|
114
|
+
g_string_free (text, FALSE);
|
115
|
+
return utf8_data;
|
116
|
+
|
117
|
+
}
|
118
|
+
|
119
|
+
|
120
|
+
|
121
|
+
void
|
122
|
+
ots_print_HTML (FILE * stream, const OtsArticle * Doc)
|
123
|
+
{
|
124
|
+
unsigned char *utf8_txt;
|
125
|
+
size_t len;
|
126
|
+
|
127
|
+
utf8_txt = ots_get_doc_HTML (Doc, &len);
|
128
|
+
fwrite (utf8_txt, 1, len, stream);
|
129
|
+
g_free (utf8_txt);
|
130
|
+
|
131
|
+
}
|
data/ext/libots.h
ADDED
@@ -0,0 +1,158 @@
|
|
1
|
+
/*
|
2
|
+
* libots.h
|
3
|
+
*
|
4
|
+
* Copyright (C) 2003 Nadav Rotem <nadav256@hotmail.com>
|
5
|
+
*
|
6
|
+
* This program is free software; you can redistribute it and/or modify
|
7
|
+
* it under the terms of the GNU General Public License as published by
|
8
|
+
* the Free Software Foundation; either version 2 of the License, or
|
9
|
+
* (at your option) any later version.
|
10
|
+
*
|
11
|
+
* This program is distributed in the hope that it will be useful,
|
12
|
+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
14
|
+
* GNU Library General Public License for more details.
|
15
|
+
*
|
16
|
+
* You should have received a copy of the GNU General Public License
|
17
|
+
* along with this program; if not, write to the Free Software
|
18
|
+
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
19
|
+
*/
|
20
|
+
|
21
|
+
#ifndef HAVE_LIBOTS_H
|
22
|
+
#define HAVE_LIBOTS_H
|
23
|
+
|
24
|
+
#include <glib.h>
|
25
|
+
|
26
|
+
G_BEGIN_DECLS
|
27
|
+
|
28
|
+
/* A word paired with a floating-point weight. */
typedef struct
|
29
|
+
{ /* the Term Frequency data structure */
|
30
|
+
char* word;
|
31
|
+
double tf; /* weight of the word. NOTE(review): the original comment read "Also used for TF"; presumably IDF was meant, since ots_new_OtsWordTF() takes an `idf` argument -- confirm */
|
32
|
+
} OtsWordTF;
|
33
|
+
|
34
|
+
|
35
|
+
/* Stemming and parsing rules; an article refers to one of these through OtsArticle.stem. */
typedef struct
|
36
|
+
{
|
37
|
+
/* the fields below are GLists of char* (presumably this applies to every list in the struct -- confirm) */
|
38
|
+
GList *RemovePre; /* entries of the form (a|b): replace string a with b */
|
39
|
+
GList *RemovePost;
|
40
|
+
GList *step1_pre;
|
41
|
+
GList *step1_post;
|
42
|
+
|
43
|
+
GList *synonyms;
|
44
|
+
GList *manual;
|
45
|
+
|
46
|
+
GList *ParserBreak;
|
47
|
+
GList *ParserDontBreak;
|
48
|
+
|
49
|
+
|
50
|
+
/* to be implemented */
|
51
|
+
GList *ReplaceChars;
|
52
|
+
|
53
|
+
} OtsStemRule;
|
54
|
+
|
55
|
+
|
56
|
+
/* One sentence ("line") of an article. */
typedef struct
|
57
|
+
{
|
58
|
+
GList *words; /* a GList of words (char*) */
|
59
|
+
glong score; /* score set by the grader */
|
60
|
+
gboolean selected; /* non-zero once the sentence has been selected for the summary */
|
61
|
+
gint wc; /* word count */
|
62
|
+
void *user_data; /* pointer to the original sentence, or possibly a serial number */
|
63
|
+
} OtsSentence;
|
64
|
+
|
65
|
+
|
66
|
+
/* An article: its sentences plus the data used by the graders. */
typedef struct
|
67
|
+
{
|
68
|
+
GList *lines; /* a GList of sentences (OtsSentence*) */
|
69
|
+
gint lineCount; /* lines in the text */
|
70
|
+
char *title; /* title, auto generated; may be NULL */
|
71
|
+
|
72
|
+
OtsStemRule *stem; /* stemming & parsing rules */
|
73
|
+
|
74
|
+
/* Term Frequency grader */
|
75
|
+
GList *tf_terms;
|
76
|
+
GList *idf_terms;
|
77
|
+
|
78
|
+
|
79
|
+
/* Term Count grader */
|
80
|
+
GList *dict; /* dictionary from xml */
|
81
|
+
GList *wordStat; /* a word list of all words in the article and their occurrence counts */
|
82
|
+
GList *ImpWords; /* important words - for the term count grader */
|
83
|
+
|
84
|
+
|
85
|
+
} OtsArticle;
|
86
|
+
|
87
|
+
|
88
|
+
OtsArticle *ots_new_article (void);
|
89
|
+
void ots_free_article (OtsArticle *art);
|
90
|
+
|
91
|
+
/*parser*/
|
92
|
+
void ots_parse_file (FILE * stream, OtsArticle * Doc); /*file input */
|
93
|
+
void ots_parse_stream(const unsigned char *utf8 , size_t len ,OtsArticle *Doc); /*parse unicode stream*/
|
94
|
+
|
95
|
+
OtsSentence *ots_append_line (OtsArticle * Doc);
|
96
|
+
void ots_append_word (OtsSentence * aLine,unsigned const char *aWord);
|
97
|
+
void ots_add_wordstat (OtsArticle * Doc,unsigned const char *wordString);
|
98
|
+
|
99
|
+
|
100
|
+
/*dictionary*/
|
101
|
+
gboolean ots_load_xml_dictionary (OtsArticle * Doc, const char *name);
|
102
|
+
|
103
|
+
int ots_get_article_word_count (const OtsArticle * Doc);
|
104
|
+
|
105
|
+
|
106
|
+
/*grader*/
|
107
|
+
void ots_highlight_doc (OtsArticle * Doc, int percent); /*example: 20%*/
|
108
|
+
void ots_highlight_doc_lines (OtsArticle * Doc, int lines); /*example: 10 lines*/
|
109
|
+
void ots_highlight_doc_words (OtsArticle * Doc, int words); /*example: 50 words*/
|
110
|
+
|
111
|
+
void ots_grade_doc (OtsArticle * Doc);
|
112
|
+
|
113
|
+
void ots_free_OtsWordTF(OtsWordTF *obj); /*todo: put in .h file*/
|
114
|
+
OtsWordTF* ots_new_OtsWordTF(const char* word,const double idf);
|
115
|
+
|
116
|
+
|
117
|
+
/*HTML output*/
|
118
|
+
void ots_print_HTML (FILE * stream, const OtsArticle * Doc);
|
119
|
+
unsigned char *ots_get_doc_HTML (const OtsArticle * Doc, size_t * out_len);
|
120
|
+
|
121
|
+
/*TEXT output*/
|
122
|
+
void ots_print_doc (FILE * stream, const OtsArticle * Doc);
|
123
|
+
unsigned char *ots_get_doc_text (const OtsArticle * Doc, size_t * out_len);
|
124
|
+
|
125
|
+
|
126
|
+
/*Plugin writing*/
|
127
|
+
unsigned char* ots_get_line_text (const OtsSentence *aLine, gboolean only_if_selected, size_t *out_size);
|
128
|
+
gboolean ots_is_line_selected(const OtsSentence *aLine);
|
129
|
+
|
130
|
+
/*Stemming support*/
|
131
|
+
OtsStemRule *new_stem_rule(void);
|
132
|
+
void free_stem_rule (OtsStemRule *rule);
|
133
|
+
unsigned char * ots_stem_strip (unsigned const char * aWord, const OtsStemRule *rule); /*returns newly allocated string with the root of the word*/
|
134
|
+
unsigned char *ots_stem_format (unsigned const char *aWord, const OtsStemRule * rule); /* Remove leading spaces, commas, colons, etc. */
|
135
|
+
|
136
|
+
/*Relations between texts*/
|
137
|
+
|
138
|
+
/*Returns the number of topics that two blocks of text share*/
|
139
|
+
int ots_text_relations(
|
140
|
+
const unsigned char *text1,const unsigned char *lang_code1,
|
141
|
+
const unsigned char *text2,const unsigned char *lang_code2,const int topic_num);
|
142
|
+
|
143
|
+
/*For a given text, return the list of the topics*/
|
144
|
+
char* ots_text_topics(const unsigned char *text,const unsigned char *lang_code,int topic_num);
|
145
|
+
|
146
|
+
|
147
|
+
/*For a given text, return the list of the stemmed topics*/
|
148
|
+
GList* ots_text_stem_list(const unsigned char *text,const unsigned char *lang_code,int topic_num);
|
149
|
+
|
150
|
+
|
151
|
+
/*Gives a score on the relations between two lists of topics; similar to the inner product*/
|
152
|
+
int ots_topic_list_score(const GList *topic_list1,const GList *topic_list2);
|
153
|
+
|
154
|
+
G_END_DECLS
|
155
|
+
|
156
|
+
|
157
|
+
|
158
|
+
#endif /* HAVE_LIBOTS_H */
|
data/ext/ots.c
CHANGED
@@ -1,197 +1,176 @@
|
|
1
|
-
#include
|
1
|
+
#include "ots.h"
|
2
|
+
#include <sys/types.h>
|
3
|
+
#include <dirent.h>
|
4
|
+
#include <errno.h>
|
2
5
|
|
3
|
-
|
4
|
-
#ifdef RUBY_VM
|
5
|
-
#include <ruby/encoding.h>
|
6
|
-
#endif
|
6
|
+
static VALUE mOTS, cArticle;
|
7
7
|
|
8
|
-
|
9
|
-
|
10
|
-
|
8
|
+
/* Free callback registered with Data_Wrap_Struct: releases the wrapped libots article, if any. */
static void article_free(OtsArticle *article) {
  if (!article)
    return;

  ots_free_article(article);
}
|
11
12
|
|
12
|
-
|
13
|
+
/* Allocator for OTS::Article: wraps a fresh libots article; article_free releases it. */
VALUE article_allocate(VALUE klass) {
  OtsArticle *fresh = ots_new_article();
  VALUE wrapped = Data_Wrap_Struct(klass, 0, article_free, fresh);

  return wrapped;
}
|
13
17
|
|
14
|
-
|
15
|
-
|
18
|
+
/*
 * Unwraps the libots article stored in an OTS::Article instance.
 * Raises ArgumentError when the instance carries no article.
 */
OtsArticle* article_handle(VALUE self) {
  OtsArticle *handle = NULL;

  Data_Get_Struct(self, OtsArticle, handle);
  if (handle == NULL)
    rb_raise(rb_eArgError, "invalid OTS::Article instance");

  return handle;
}
|
16
25
|
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
26
|
+
/* Loads the named dictionary into the article; raises LoadError when libots reports failure. */
void article_load_dictionary(OtsArticle *article, char *name) {
  gboolean loaded = ots_load_xml_dictionary(article, name);

  if (loaded)
    return;

  rb_raise(rb_eLoadError, "Could not find dictionary file: %s", name);
}
|
21
31
|
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
gint occ; /* how many times have we seen this word in the text? */
|
26
|
-
} OtsWordEntery;
|
32
|
+
VALUE article_initialize(int argc, VALUE *argv, VALUE self) {
|
33
|
+
VALUE text, dictionary;
|
34
|
+
OtsArticle *article = article_handle(self);
|
27
35
|
|
36
|
+
rb_scan_args(argc, argv, "11", &text, &dictionary);
|
28
37
|
|
29
|
-
|
38
|
+
if (TYPE(text) != T_STRING)
|
39
|
+
rb_raise(rb_eArgError, "invalid +text+");
|
30
40
|
|
31
|
-
|
32
|
-
|
33
|
-
if (rb_article_object == Qnil) {
|
34
|
-
if (error_on_missing)
|
35
|
-
rb_raise(eRuntimeError, "libots document not initialized properly. Did you forget to parse content ?");
|
41
|
+
if (NIL_P(dictionary))
|
42
|
+
article_load_dictionary(article, "en");
|
36
43
|
else
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
44
|
+
article_load_dictionary(article, CSTRING(dictionary));
|
45
|
+
|
46
|
+
ots_parse_stream(RSTRING_PTR(text), RSTRING_LEN(text), article);
|
47
|
+
ots_grade_doc(article);
|
41
48
|
|
42
|
-
|
43
|
-
|
44
|
-
|
49
|
+
rb_iv_set(self, "@encoding", (VALUE)rb_enc_get(text));
|
50
|
+
|
51
|
+
return self;
|
45
52
|
}
|
46
53
|
|
47
|
-
VALUE rb_string(char *utf8) {
|
48
|
-
VALUE str = rb_str_new(utf8, strlen(utf8));
|
49
54
|
|
50
|
-
|
51
|
-
|
52
|
-
rb_enc_associate(str, rb_to_encoding(rb_str_new2("UTF-8")));
|
53
|
-
ENC_CODERANGE_CLEAR(str);
|
54
|
-
#endif
|
55
|
+
VALUE article_summary(OtsArticle *article, rb_encoding *encoding) {
|
56
|
+
OtsSentence *sentence;
|
55
57
|
|
56
|
-
|
57
|
-
|
58
|
+
GList *line_ptr = article->lines;
|
59
|
+
VALUE summary = rb_ary_new();
|
58
60
|
|
59
|
-
|
61
|
+
while (line_ptr != NULL) {
|
62
|
+
sentence = (OtsSentence *)line_ptr->data;
|
60
63
|
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
if (article != NULL) {
|
65
|
-
dict = rb_iv_get(self, "@dict");
|
66
|
-
ots_free_article(article);
|
67
|
-
}
|
68
|
-
article = ots_new_article();
|
69
|
-
rb_iv_set(self, "@article", Data_Wrap_Struct(rb_cObject, 0, 0, article));
|
70
|
-
rb_iv_set(self, "@dict", dict);
|
71
|
-
return self;
|
72
|
-
}
|
64
|
+
if (sentence->selected) {
|
65
|
+
size_t size;
|
66
|
+
unsigned char* content = ots_get_line_text(sentence, TRUE, &size);
|
73
67
|
|
74
|
-
VALUE
|
75
|
-
|
76
|
-
|
68
|
+
VALUE line = rb_hash_new();
|
69
|
+
rb_hash_aset(line, ID2SYM(rb_intern("sentence")), rb_enc_str_new((char *)content, size, encoding));
|
70
|
+
rb_hash_aset(line, ID2SYM(rb_intern("score")), LONG2FIX(sentence->score));
|
71
|
+
rb_ary_push(summary, line);
|
77
72
|
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
article = get_article(self, TRUE);
|
82
|
-
}
|
73
|
+
// reset this so subsequent calls work right.
|
74
|
+
sentence->selected = FALSE;
|
75
|
+
}
|
83
76
|
|
84
|
-
|
85
|
-
rb_ots_free_article(self);
|
86
|
-
rb_raise(eLoadError, "Could not find dictionary file: %s", dict_cstr);
|
77
|
+
line_ptr = g_list_next(line_ptr);
|
87
78
|
}
|
88
79
|
|
89
|
-
|
90
|
-
return Qtrue;
|
80
|
+
return summary;
|
91
81
|
}
|
92
82
|
|
93
|
-
VALUE
|
94
|
-
|
95
|
-
|
83
|
+
VALUE article_summarize(VALUE self, VALUE options) {
|
84
|
+
VALUE lines, percent;
|
85
|
+
OtsArticle *article = article_handle(self);
|
96
86
|
|
97
|
-
|
98
|
-
|
99
|
-
OtsArticle *article = get_article(self, TRUE);
|
100
|
-
ots_parse_stream(string_cstr, string_len, article);
|
101
|
-
ots_grade_doc(article);
|
102
|
-
return Qtrue;
|
103
|
-
}
|
87
|
+
if (TYPE(options) != T_HASH)
|
88
|
+
rb_raise(rb_eArgError, "expect an options hash");
|
104
89
|
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
90
|
+
lines = rb_hash_aref(options, ID2SYM(rb_intern("lines")));
|
91
|
+
percent = rb_hash_aref(options, ID2SYM(rb_intern("percent")));
|
92
|
+
|
93
|
+
if (NIL_P(lines) && NIL_P(percent))
|
94
|
+
rb_raise(rb_eArgError, "expect +lines+ or +percent+ to be provided");
|
110
95
|
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
96
|
+
if (lines != Qnil)
|
97
|
+
ots_highlight_doc_lines(article, NUM2INT(lines));
|
98
|
+
else
|
99
|
+
ots_highlight_doc(article, NUM2INT(percent));
|
100
|
+
|
101
|
+
return article_summary(article, (rb_encoding *)rb_iv_get(self, "@encoding"));
|
115
102
|
}
|
116
103
|
|
117
|
-
VALUE
|
118
|
-
|
119
|
-
|
120
|
-
return rb_string(article->title);
|
121
|
-
else
|
122
|
-
return Qnil;
|
104
|
+
VALUE article_title(VALUE self) {
|
105
|
+
OtsArticle *article = article_handle(self);
|
106
|
+
return (article->title ? rb_enc_str_new2(article->title, (rb_encoding*)rb_iv_get(self, "@encoding")) : Qnil);
|
123
107
|
}
|
124
108
|
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
OtsWordEntery *data = (OtsWordEntery *)words->data;
|
131
|
-
if (data != NULL && strlen(data->word) > 0)
|
132
|
-
rb_ary_push(iwords, rb_string(data->word));
|
133
|
-
words = words->next;
|
134
|
-
}
|
109
|
+
typedef struct {
|
110
|
+
gchar *word; /* the word */
|
111
|
+
gchar *stem; /*stem of the word*/
|
112
|
+
gint occ; /* how many times have we seen this word in the text? */
|
113
|
+
} OtsWordEntry;
|
135
114
|
|
136
|
-
return iwords;
|
137
|
-
}
|
138
115
|
|
139
|
-
VALUE
|
140
|
-
|
141
|
-
|
142
|
-
GList *curr_line = article->lines;
|
143
|
-
VALUE hlt_lines = rb_ary_new();
|
116
|
+
VALUE article_keywords(VALUE self) {
|
117
|
+
OtsArticle *article = article_handle(self);
|
118
|
+
rb_encoding *encoding = (rb_encoding*)rb_iv_get(self, "@encoding");
|
144
119
|
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
rb_ary_push(hlt_lines, hlt_line);
|
120
|
+
VALUE words = rb_ary_new();
|
121
|
+
GList* word_ptr = article->ImpWords;
|
122
|
+
|
123
|
+
while (word_ptr) {
|
124
|
+
OtsWordEntry *data = (OtsWordEntry *)word_ptr->data;
|
125
|
+
if (data && strlen(data->word) > 0)
|
126
|
+
rb_ary_push(words, rb_enc_str_new2(data->word, encoding));
|
127
|
+
word_ptr = word_ptr->next;
|
154
128
|
}
|
155
|
-
curr_line = g_list_next(curr_line);
|
156
|
-
}
|
157
129
|
|
158
|
-
|
130
|
+
return words;
|
159
131
|
}
|
160
132
|
|
161
|
-
VALUE
|
162
|
-
|
163
|
-
|
164
|
-
|
133
|
+
/* OTS.parse(text, dictionary = nil) -- allocates an OTS::Article and initializes it with the given arguments. */
VALUE ots_parse(int argc, VALUE *argv, VALUE self) {
  VALUE instance = article_allocate(cArticle);

  /* article_initialize returns the instance it was given */
  return article_initialize(argc, argv, instance);
}
|
165
138
|
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
139
|
+
/*
 * OTS.dictionaries -- names of the dictionaries found in DICTIONARY_DIR.
 *
 * Returns an array holding the name of every "<name>.xml" entry with the
 * extension stripped.  Raises IOError when the directory cannot be opened.
 */
VALUE ots_dictionaries(VALUE self) {
  static const char suffix[] = ".xml";
  const size_t suffix_len    = sizeof(suffix) - 1;
  DIR *dir;
  struct dirent *entry;
  VALUE dictionaries = rb_ary_new();

  dir = opendir(DICTIONARY_DIR);
  if (!dir)
    rb_raise(rb_eIOError, "unable to open dictionary directory: %s", strerror(errno));

  while ((entry = readdir(dir))) {
    // entry->d_type is not portable.
    size_t name_len = strlen(entry->d_name);

    /* The name must END in ".xml" and have a non-empty stem.  A substring
       match would also accept names such as "en.xml.bak" and then strip
       the wrong four characters. */
    if (name_len <= suffix_len)
      continue;
    if (strcmp(entry->d_name + name_len - suffix_len, suffix) != 0)
      continue;

    rb_ary_push(dictionaries, rb_str_new(entry->d_name, (long)(name_len - suffix_len)));
  }

  closedir(dir);
  return dictionaries;
}
|
181
158
|
|
182
159
|
/* init */
|
183
160
|
|
184
161
|
void Init_ots(void) {
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
rb_define_method(
|
190
|
-
rb_define_method(
|
191
|
-
rb_define_method(
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
162
|
+
mOTS = rb_define_module("OTS");
|
163
|
+
cArticle = rb_define_class_under(mOTS, "Article", rb_cObject);
|
164
|
+
|
165
|
+
rb_define_method(cArticle, "initialize", RUBY_METHOD_FUNC(article_initialize), -1);
|
166
|
+
rb_define_method(cArticle, "summarize", RUBY_METHOD_FUNC(article_summarize), 1);
|
167
|
+
rb_define_method(cArticle, "title", RUBY_METHOD_FUNC(article_title), 0);
|
168
|
+
rb_define_method(cArticle, "keywords", RUBY_METHOD_FUNC(article_keywords), 0);
|
169
|
+
|
170
|
+
rb_define_module_function(mOTS, "parse", RUBY_METHOD_FUNC(ots_parse), -1);
|
171
|
+
rb_define_module_function(mOTS, "dictionaries", RUBY_METHOD_FUNC(ots_dictionaries), 0);
|
172
|
+
|
173
|
+
rb_define_alloc_func(cArticle, article_allocate);
|
174
|
+
|
175
|
+
rb_define_const(mOTS, "VERSION", rb_str_new2(RUBY_OTS_VERSION));
|
197
176
|
}
|