ots 0.4.3 → 0.4.4
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +80 -0
- data/dictionaries/bg.xml +101 -0
- data/dictionaries/ca.xml +141 -0
- data/dictionaries/cs.xml +161 -0
- data/dictionaries/cy.xml +118 -0
- data/dictionaries/da.xml +129 -0
- data/dictionaries/de.xml +354 -0
- data/dictionaries/el.xml +80 -0
- data/dictionaries/en.xml +606 -0
- data/dictionaries/eo.xml +171 -0
- data/dictionaries/es.xml +369 -0
- data/dictionaries/et.xml +172 -0
- data/dictionaries/eu.xml +77 -0
- data/dictionaries/fi.xml +105 -0
- data/dictionaries/fr.xml +199 -0
- data/dictionaries/ga.xml +124 -0
- data/dictionaries/gl.xml +290 -0
- data/dictionaries/he.xml +334 -0
- data/dictionaries/hu.xml +280 -0
- data/dictionaries/ia.xml +97 -0
- data/dictionaries/id.xml +75 -0
- data/dictionaries/is.xml +201 -0
- data/dictionaries/it.xml +206 -0
- data/dictionaries/lv.xml +77 -0
- data/dictionaries/mi.xml +76 -0
- data/dictionaries/ms.xml +160 -0
- data/dictionaries/mt.xml +73 -0
- data/dictionaries/nl.xml +245 -0
- data/dictionaries/nn.xml +264 -0
- data/dictionaries/pl.xml +92 -0
- data/dictionaries/pt.xml +365 -0
- data/dictionaries/ro.xml +163 -0
- data/dictionaries/ru.xml +150 -0
- data/dictionaries/sv.xml +255 -0
- data/dictionaries/tl.xml +67 -0
- data/dictionaries/tr.xml +65 -0
- data/dictionaries/uk.xml +98 -0
- data/dictionaries/yi.xml +293 -0
- data/ext/article.c +119 -0
- data/ext/dictionary.c +335 -0
- data/ext/extconf.rb +13 -14
- data/ext/grader-tc.c +185 -0
- data/ext/grader-tc.h +64 -0
- data/ext/grader-tf.c +116 -0
- data/ext/grader.c +85 -0
- data/ext/highlighter.c +128 -0
- data/ext/html.c +131 -0
- data/ext/libots.h +158 -0
- data/ext/ots.c +130 -151
- data/ext/ots.h +15 -0
- data/ext/parser.c +173 -0
- data/ext/relations.c +163 -0
- data/ext/stemmer.c +332 -0
- data/ext/text.c +98 -0
- data/ext/version.h +2 -0
- data/ext/wordlist.c +220 -0
- data/test/helper.rb +3 -0
- data/test/test_article.rb +52 -0
- data/test/test_ots.rb +23 -0
- metadata +122 -38
- data/README +0 -25
- data/VERSION +0 -1
- data/lib/ots.rb +0 -1
- data/test/ots_test.rb +0 -62
data/ext/ots.h
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
#pragma once
|
2
|
+
|
3
|
+
#include <ruby.h>
|
4
|
+
#include <ruby/encoding.h>
|
5
|
+
|
6
|
+
#include <stdio.h>
|
7
|
+
#include <stdlib.h>
|
8
|
+
#include <string.h>
|
9
|
+
|
10
|
+
#include <libots.h>
|
11
|
+
#include "version.h"
|
12
|
+
|
13
|
+
/* Call #to_s on a Ruby VALUE; evaluates to the resulting VALUE. */
#define TO_S(v) rb_funcall((v), rb_intern("to_s"), 0)

/*
 * C string pointer of the #to_s form of a Ruby VALUE.
 * NOTE(review): the pointer refers to the String produced by TO_S, which is
 * not otherwise referenced here -- confirm callers copy it before the
 * object can be collected.
 */
#define CSTRING(v) RSTRING_PTR(TO_S(v))

/*
 * Build an encoded Ruby string from a NUL-terminated C string.
 * `text` is evaluated twice, so do not pass an expression with side effects.
 */
#define rb_enc_str_new2(text, enc) rb_enc_str_new((text), strlen(text), (enc))
|
data/ext/parser.c
ADDED
@@ -0,0 +1,173 @@
|
|
1
|
+
/*
|
2
|
+
* parser.c
|
3
|
+
*
|
4
|
+
* Copyright (C) 2003 Nadav Rotem <nadav256@hotmail.com>
|
5
|
+
*
|
6
|
+
* This program is free software; you can redistribute it and/or modify
|
7
|
+
* it under the terms of the GNU General Public License as published by
|
8
|
+
* the Free Software Foundation; either version 2 of the License, or
|
9
|
+
* (at your option) any later version.
|
10
|
+
*
|
11
|
+
* This program is distributed in the hope that it will be useful,
|
12
|
+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
14
|
+
* GNU Library General Public License for more details.
|
15
|
+
*
|
16
|
+
* You should have received a copy of the GNU General Public License
|
17
|
+
* along with this program; if not, write to the Free Software
|
18
|
+
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
19
|
+
*/
|
20
|
+
|
21
|
+
#include <stdio.h>
|
22
|
+
#include <stdlib.h>
|
23
|
+
#include <string.h>
|
24
|
+
#include <strings.h>
|
25
|
+
#include "libots.h"
|
26
|
+
|
27
|
+
#define BUFFER_SIZE (1024*8)
|
28
|
+
|
29
|
+
/*
 * Report whether `aWord` ends with the suffix `post`.
 *
 * Returns 1 when the last strlen(post) bytes of aWord equal post (an empty
 * suffix always matches), 0 otherwise. Both arguments must be non-NULL,
 * NUL-terminated strings.
 */
int
ots_match_post (const char *aWord,const char *post)
{
  const size_t word_len = strlen (aWord);
  const size_t post_len = strlen (post);

  /* A suffix longer than the word can never match. */
  if (post_len > word_len)
    return 0;

  /* Compare the tail of the word against the suffix in one call. */
  return memcmp (aWord + (word_len - post_len), post, post_len) == 0 ? 1 : 0;
}
|
46
|
+
|
47
|
+
void
|
48
|
+
ots_parse_file (FILE * stream, OtsArticle * Doc )
|
49
|
+
{
|
50
|
+
unsigned char fread_buffer[BUFFER_SIZE];
|
51
|
+
unsigned char *buffer;
|
52
|
+
size_t nread, total_read, avail_size;
|
53
|
+
|
54
|
+
buffer = g_new0 (unsigned char, BUFFER_SIZE);
|
55
|
+
|
56
|
+
avail_size = BUFFER_SIZE;
|
57
|
+
total_read = nread = 0;
|
58
|
+
while ((nread =
|
59
|
+
fread (fread_buffer, sizeof (unsigned char), sizeof (fread_buffer),
|
60
|
+
stream)) > 0)
|
61
|
+
{
|
62
|
+
if (nread + total_read > avail_size)
|
63
|
+
{
|
64
|
+
avail_size *= 2;
|
65
|
+
buffer = g_renew (unsigned char, buffer, avail_size);
|
66
|
+
}
|
67
|
+
|
68
|
+
strncpy (buffer + total_read, fread_buffer, nread);
|
69
|
+
total_read += nread;
|
70
|
+
}
|
71
|
+
|
72
|
+
ots_parse_stream (buffer, total_read, Doc);
|
73
|
+
g_free (buffer);
|
74
|
+
}
|
75
|
+
|
76
|
+
|
77
|
+
|
78
|
+
|
79
|
+
|
80
|
+
int
|
81
|
+
ots_parser_should_break(const char *aWord,const OtsStemRule * rule)
|
82
|
+
{
|
83
|
+
GList *li;
|
84
|
+
char *postfix;
|
85
|
+
int toBreak=0;
|
86
|
+
|
87
|
+
for (li = (GList *) rule->ParserBreak; li != NULL; li = li->next)
|
88
|
+
{
|
89
|
+
postfix=li->data;
|
90
|
+
if (ots_match_post (aWord, postfix) )
|
91
|
+
{
|
92
|
+
toBreak=1;
|
93
|
+
break;
|
94
|
+
}
|
95
|
+
|
96
|
+
}
|
97
|
+
|
98
|
+
|
99
|
+
for (li = (GList *) rule->ParserDontBreak; li != NULL; li = li->next)
|
100
|
+
{
|
101
|
+
postfix=li->data;
|
102
|
+
if (ots_match_post (aWord, postfix) )
|
103
|
+
{
|
104
|
+
toBreak=0;
|
105
|
+
break;
|
106
|
+
}
|
107
|
+
|
108
|
+
}
|
109
|
+
return toBreak;
|
110
|
+
}
|
111
|
+
|
112
|
+
|
113
|
+
|
114
|
+
void
|
115
|
+
ots_parse_stream(const unsigned char *utf8, size_t len, OtsArticle * Doc) /*parse the unicode stream */
|
116
|
+
{
|
117
|
+
|
118
|
+
OtsSentence *tmpLine = ots_append_line (Doc);
|
119
|
+
OtsStemRule * rule=Doc->stem;
|
120
|
+
gunichar uc;
|
121
|
+
int index = 0;
|
122
|
+
char *s = (char *) utf8;
|
123
|
+
GString *word_buffer = g_string_new (NULL);
|
124
|
+
|
125
|
+
|
126
|
+
while ((*s) && (index < len))
|
127
|
+
{
|
128
|
+
uc = g_utf8_get_char (s);
|
129
|
+
|
130
|
+
if (!g_unichar_isspace (uc)) /* space is the end of a word */
|
131
|
+
{
|
132
|
+
|
133
|
+
g_string_append_unichar(word_buffer,uc);
|
134
|
+
|
135
|
+
}
|
136
|
+
else
|
137
|
+
{
|
138
|
+
|
139
|
+
if (0<word_buffer->len)
|
140
|
+
{
|
141
|
+
ots_append_word (tmpLine, word_buffer->str);
|
142
|
+
|
143
|
+
if (ots_parser_should_break(word_buffer->str,rule)) {
|
144
|
+
tmpLine = ots_append_line (Doc); /* Add a new Line */
|
145
|
+
}
|
146
|
+
|
147
|
+
g_string_assign (word_buffer, "");
|
148
|
+
|
149
|
+
}
|
150
|
+
|
151
|
+
if (uc=='\n') {ots_append_word (tmpLine,"\n");}
|
152
|
+
else
|
153
|
+
{ots_append_word (tmpLine," ");}
|
154
|
+
|
155
|
+
g_string_assign (word_buffer,"");
|
156
|
+
}
|
157
|
+
|
158
|
+
s = g_utf8_next_char (s);
|
159
|
+
|
160
|
+
index++;
|
161
|
+
}
|
162
|
+
|
163
|
+
|
164
|
+
if (0<word_buffer->len) /*final flush*/
|
165
|
+
{
|
166
|
+
ots_append_word (tmpLine, word_buffer->str);
|
167
|
+
g_string_assign (word_buffer, "");
|
168
|
+
}
|
169
|
+
|
170
|
+
|
171
|
+
|
172
|
+
g_string_free (word_buffer, TRUE);
|
173
|
+
}
|
data/ext/relations.c
ADDED
@@ -0,0 +1,163 @@
|
|
1
|
+
/*
|
2
|
+
* relations.c
|
3
|
+
*
|
4
|
+
* Copyright (C) 2003 Nadav Rotem <nadav256@hotmail.com>
|
5
|
+
*
|
6
|
+
* This program is free software; you can redistribute it and/or modify
|
7
|
+
* it under the terms of the GNU General Public License as published by
|
8
|
+
* the Free Software Foundation; either version 2 of the License, or
|
9
|
+
* (at your option) any later version.
|
10
|
+
*
|
11
|
+
* This program is distributed in the hope that it will be useful,
|
12
|
+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
14
|
+
* GNU Library General Public License for more details.
|
15
|
+
*
|
16
|
+
* You should have received a copy of the GNU General Public License
|
17
|
+
* along with this program; if not, write to the Free Software
|
18
|
+
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
19
|
+
*/
|
20
|
+
|
21
|
+
#include <stdio.h>
|
22
|
+
#include <stdlib.h>
|
23
|
+
#include <string.h>
|
24
|
+
#include "grader-tc.h"
|
25
|
+
|
26
|
+
#include "libots.h"
|
27
|
+
/*
|
28
|
+
The Inner product of two texts is defined as the number of topics they
|
29
|
+
share. This set of functions implements this relation using the ots
|
30
|
+
api.
|
31
|
+
|
32
|
+
Application: a relation between a slashdot article and a comment made
|
33
|
+
usage: ots_text_relations(story,"en",comment,"en",n);
|
34
|
+
where n is the max number of most important topics to consider; safe to give a high number (ex: 20);
|
35
|
+
|
36
|
+
returns:
|
37
|
+
0 - off topic
|
38
|
+
n - number of topics they share
|
39
|
+
|
40
|
+
*/
|
41
|
+
|
42
|
+
#define OTS_MAX_TOPIC_WORD_SIZE 256
|
43
|
+
|
44
|
+
/*Returns the number of topics that two blocks of text share*/
|
45
|
+
int ots_text_relations(
|
46
|
+
const unsigned char *text1,const unsigned char *lang_code1,
|
47
|
+
const unsigned char *text2,const unsigned char *lang_code2,const int topic_num)
|
48
|
+
{
|
49
|
+
GList* top1;
|
50
|
+
GList* top2;
|
51
|
+
int score;
|
52
|
+
|
53
|
+
top1=ots_text_stem_list(text1,lang_code1,topic_num);
|
54
|
+
top2=ots_text_stem_list(text2,lang_code2,topic_num);
|
55
|
+
|
56
|
+
score=ots_topic_list_score(top1,top2);
|
57
|
+
|
58
|
+
if (top1){g_list_foreach (top1, (GFunc) g_free, NULL);g_list_free (top1);}
|
59
|
+
if (top2){g_list_foreach (top2, (GFunc) g_free, NULL);g_list_free (top2);}
|
60
|
+
|
61
|
+
return score;
|
62
|
+
}
|
63
|
+
|
64
|
+
|
65
|
+
|
66
|
+
|
67
|
+
/*For a given text, return the list of the topics*/
|
68
|
+
char* ots_text_topics(
|
69
|
+
const unsigned char *text,const unsigned char *lang_code,int topic_num)
|
70
|
+
{
|
71
|
+
int i;
|
72
|
+
GString *word;
|
73
|
+
unsigned char *str;
|
74
|
+
unsigned char *tmp;
|
75
|
+
OtsArticle *Art;
|
76
|
+
|
77
|
+
if (NULL==text) return NULL;
|
78
|
+
word = g_string_new (NULL);
|
79
|
+
|
80
|
+
Art = ots_new_article ();
|
81
|
+
|
82
|
+
ots_load_xml_dictionary(Art,lang_code); /*Load the dictionary*/
|
83
|
+
if (text!=NULL) ots_parse_stream (text,strlen(text), Art); /* read text , put it in struct Article */
|
84
|
+
ots_grade_doc (Art);
|
85
|
+
|
86
|
+
|
87
|
+
for (i=0;i<=topic_num;i++)
|
88
|
+
{
|
89
|
+
tmp=ots_word_in_list(Art->ImpWords,i);
|
90
|
+
if ((tmp!=NULL)&&(strlen(tmp)>0)) {g_string_append(word,tmp);
|
91
|
+
g_string_append(word," "); }
|
92
|
+
}
|
93
|
+
|
94
|
+
|
95
|
+
str=word->str;
|
96
|
+
g_string_free (word, FALSE);
|
97
|
+
ots_free_article (Art);
|
98
|
+
|
99
|
+
return str;
|
100
|
+
}
|
101
|
+
|
102
|
+
|
103
|
+
|
104
|
+
/*For a given text, return the list of the stemmed topics*/
|
105
|
+
GList* ots_text_stem_list(const unsigned char *text, const unsigned char *lang_code, int topic_num)
|
106
|
+
{
|
107
|
+
int i;
|
108
|
+
GList *topics=NULL;
|
109
|
+
unsigned char *tmp;
|
110
|
+
OtsArticle *Art;
|
111
|
+
|
112
|
+
if (NULL==text) return NULL;
|
113
|
+
|
114
|
+
Art = ots_new_article ();
|
115
|
+
|
116
|
+
ots_load_xml_dictionary(Art,lang_code);
|
117
|
+
if (text!=NULL) ots_parse_stream (text,strlen(text), Art);
|
118
|
+
ots_grade_doc (Art);
|
119
|
+
|
120
|
+
|
121
|
+
for (i=0;i<=topic_num;i++)
|
122
|
+
{
|
123
|
+
tmp=ots_stem_in_list(Art->ImpWords,i);
|
124
|
+
if ((tmp)&&(strlen(tmp)>0))
|
125
|
+
topics=g_list_append(topics,g_strdup(tmp));
|
126
|
+
}
|
127
|
+
|
128
|
+
|
129
|
+
ots_free_article (Art);
|
130
|
+
return topics;
|
131
|
+
}
|
132
|
+
|
133
|
+
/*Gives a score on the relations between two lists of topics; simmilar to the inner product*/
|
134
|
+
int ots_topic_list_score(
|
135
|
+
const GList *topic_list1,
|
136
|
+
const GList *topic_list2)
|
137
|
+
{
|
138
|
+
int count=0;
|
139
|
+
GList *tmplist1;
|
140
|
+
GList *tmplist2;
|
141
|
+
|
142
|
+
if (!(topic_list1)) return 0;
|
143
|
+
if (!(topic_list2)) return 0;
|
144
|
+
|
145
|
+
tmplist1 = g_list_first(topic_list1);
|
146
|
+
while(tmplist1)
|
147
|
+
{
|
148
|
+
tmplist2 = g_list_first(topic_list2);
|
149
|
+
while(tmplist2)
|
150
|
+
{
|
151
|
+
|
152
|
+
if ((tmplist1->data)&&(tmplist2->data)&&(strlen(tmplist2->data)>1))
|
153
|
+
if (0==strncmp(tmplist1->data,tmplist2->data,OTS_MAX_TOPIC_WORD_SIZE))
|
154
|
+
{count++;}
|
155
|
+
|
156
|
+
tmplist2 = g_list_next(tmplist2);
|
157
|
+
}
|
158
|
+
tmplist1 = g_list_next(tmplist1);
|
159
|
+
}
|
160
|
+
|
161
|
+
return count;
|
162
|
+
}
|
163
|
+
|
data/ext/stemmer.c
ADDED
@@ -0,0 +1,332 @@
|
|
1
|
+
/*
|
2
|
+
* stemmer.c
|
3
|
+
*
|
4
|
+
* Copyright (C) 2003 Nadav Rotem <nadav256@hotmail.com>
|
5
|
+
*
|
6
|
+
* This program is free software; you can redistribute it and/or modify
|
7
|
+
* it under the terms of the GNU General Public License as published by
|
8
|
+
* the Free Software Foundation; either version 2 of the License, or
|
9
|
+
* (at your option) any later version.
|
10
|
+
*
|
11
|
+
* This program is distributed in the hope that it will be useful,
|
12
|
+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
14
|
+
* GNU Library General Public License for more details.
|
15
|
+
*
|
16
|
+
* You should have received a copy of the GNU General Public License
|
17
|
+
* along with this program; if not, write to the Free Software
|
18
|
+
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
19
|
+
*/
|
20
|
+
|
21
|
+
|
22
|
+
#include <stdio.h>
|
23
|
+
#include <stdlib.h>
|
24
|
+
#include <string.h>
|
25
|
+
#include "libots.h"
|
26
|
+
|
27
|
+
#define MAX_PREFIX_SIZE 256
|
28
|
+
|
29
|
+
OtsStemRule *
|
30
|
+
new_stem_rule ()
|
31
|
+
{
|
32
|
+
OtsStemRule *rule = g_new0 (OtsStemRule, 1);
|
33
|
+
return rule;
|
34
|
+
}
|
35
|
+
|
36
|
+
void
|
37
|
+
free_stem_rule (OtsStemRule *rule)
|
38
|
+
{
|
39
|
+
|
40
|
+
if (rule != NULL)
|
41
|
+
{
|
42
|
+
g_list_foreach (rule->RemovePre, (GFunc) g_free, NULL);
|
43
|
+
g_list_free (rule->RemovePre);
|
44
|
+
g_list_foreach (rule->RemovePost, (GFunc) g_free, NULL);
|
45
|
+
g_list_free (rule->RemovePost);
|
46
|
+
|
47
|
+
g_list_foreach (rule->step1_pre, (GFunc) g_free, NULL);
|
48
|
+
g_list_free (rule->step1_pre);
|
49
|
+
g_list_foreach (rule->step1_post, (GFunc) g_free, NULL);
|
50
|
+
g_list_free (rule->step1_post);
|
51
|
+
|
52
|
+
g_list_foreach (rule->synonyms, (GFunc) g_free, NULL);
|
53
|
+
g_list_free (rule->synonyms);
|
54
|
+
g_list_foreach (rule->manual, (GFunc) g_free, NULL);
|
55
|
+
g_list_free (rule->manual);
|
56
|
+
|
57
|
+
g_list_foreach (rule->ParserBreak, (GFunc) g_free, NULL);
|
58
|
+
g_list_free (rule->ParserBreak);
|
59
|
+
g_list_foreach (rule->ParserDontBreak, (GFunc) g_free, NULL);
|
60
|
+
g_list_free (rule->ParserDontBreak);
|
61
|
+
|
62
|
+
g_list_foreach (rule->ReplaceChars, (GFunc) g_free, NULL);
|
63
|
+
g_list_free (rule->ReplaceChars);
|
64
|
+
|
65
|
+
g_free (rule);
|
66
|
+
}
|
67
|
+
return;
|
68
|
+
}
|
69
|
+
|
70
|
+
/*
 * Split a "left|right" rule such as "red|blue" into its two halves.
 *
 * comp   - NUL-terminated rule text.
 * part_a - receives the text before the first '|'.
 * part_b - receives the text after the first '|' (empty when there is no
 *          '|'); any further '|' characters are copied verbatim.
 *
 * Both outputs must be caller-allocated buffers of at least MAX_PREFIX_SIZE
 * bytes. Each half is truncated to MAX_PREFIX_SIZE - 1 bytes so that the
 * terminating NUL always stays inside the buffer. If any argument is NULL
 * the function returns without touching the outputs.
 */
static void
ots_stem_break (unsigned const char *comp,unsigned char *part_a,unsigned char *part_b)
{
  size_t src = 0;
  size_t dst = 0;

  if (comp == NULL || part_a == NULL || part_b == NULL)
    return;

  /* Left half: copy up to the separator, dropping anything that does not fit. */
  while (comp[src] != '\0' && comp[src] != '|')
    {
      if (dst < MAX_PREFIX_SIZE - 1)
        part_a[dst++] = comp[src];
      src++;
    }
  part_a[dst] = '\0';

  /* Skip the separator itself, if one was found. */
  if (comp[src] == '|')
    src++;

  /* Right half: everything that remains, bounded by the buffer size. */
  dst = 0;
  while (comp[src] != '\0' && dst < MAX_PREFIX_SIZE - 1)
    part_b[dst++] = comp[src++];
  part_b[dst] = '\0';
}
|
104
|
+
|
105
|
+
|
106
|
+
static unsigned char *
|
107
|
+
ots_stem_remove_pre (unsigned const char *aWord,unsigned const char *pre,unsigned const char *new)
|
108
|
+
{
|
109
|
+
int i, plen, wlen, nlen;
|
110
|
+
unsigned char *new_str = NULL;
|
111
|
+
|
112
|
+
if (aWord==NULL) return NULL;
|
113
|
+
|
114
|
+
plen = strlen (pre);
|
115
|
+
wlen = strlen (aWord);
|
116
|
+
nlen = strlen (new);
|
117
|
+
|
118
|
+
for (i = 0; i < plen; i++)
|
119
|
+
if (aWord[i] != pre[i])
|
120
|
+
return NULL; /*no match */
|
121
|
+
|
122
|
+
new_str = g_new0 (char, wlen + nlen +5);
|
123
|
+
for (i = 0; i <= nlen; i++)
|
124
|
+
new_str[i] = new[i];
|
125
|
+
|
126
|
+
for (i = nlen; i <= nlen + wlen - plen; i++)
|
127
|
+
new_str[i] = aWord[i + plen - nlen];
|
128
|
+
|
129
|
+
new_str[i + 1] = 0;
|
130
|
+
return new_str;
|
131
|
+
}
|
132
|
+
|
133
|
+
|
134
|
+
|
135
|
+
static unsigned char *
|
136
|
+
ots_stem_remove_post (unsigned const char *aWord,unsigned const char *post,unsigned const char *new)
|
137
|
+
{
|
138
|
+
unsigned int i, wlen, plen, nlen;
|
139
|
+
unsigned char *new_str = NULL;
|
140
|
+
|
141
|
+
if ((NULL==aWord)||(NULL==post)||(NULL==new)) return NULL;
|
142
|
+
|
143
|
+
wlen = strlen (aWord);
|
144
|
+
plen = strlen (post);
|
145
|
+
nlen = strlen (new);
|
146
|
+
|
147
|
+
if (plen>wlen) return NULL;
|
148
|
+
|
149
|
+
|
150
|
+
for (i = 0; i < plen; i++)
|
151
|
+
if (aWord[wlen - plen + i]!= post[i])
|
152
|
+
return NULL; /* no match */
|
153
|
+
|
154
|
+
new_str = g_new0 (char, wlen + nlen +5);
|
155
|
+
|
156
|
+
for (i = 0; i <= wlen - plen; i++) /*place word */
|
157
|
+
new_str[i] = aWord[i];
|
158
|
+
|
159
|
+
for (i = 0; i <= nlen; i++) /*place newfix */
|
160
|
+
new_str[wlen - plen + i] = new[i];
|
161
|
+
|
162
|
+
return new_str; /*word replaced */
|
163
|
+
}
|
164
|
+
|
165
|
+
|
166
|
+
|
167
|
+
/*
 * Whole-word replacement: when `aWord` equals `old`, return a g_strdup()
 * copy of `new`; otherwise (or when aWord is NULL) return NULL.
 * The caller frees a non-NULL result with g_free().
 */
static unsigned char *
ots_stem_replace_word (unsigned const char *aWord,unsigned const char *old,unsigned const char *new)
{
  if (aWord == NULL)
    return NULL;

  if (strcmp (aWord, old) != 0)
    return NULL;

  return g_strdup (new);
}
|
183
|
+
|
184
|
+
|
185
|
+
|
186
|
+
|
187
|
+
|
188
|
+
unsigned char *
|
189
|
+
ots_stem_format (unsigned const char *aWord, const OtsStemRule * rule)
|
190
|
+
{
|
191
|
+
GList *li;
|
192
|
+
unsigned char *rep = NULL;
|
193
|
+
unsigned char *normWord = NULL;
|
194
|
+
|
195
|
+
if (aWord==NULL) return NULL;
|
196
|
+
|
197
|
+
normWord = g_utf8_strdown (aWord, -1); /*lowercase the word */
|
198
|
+
|
199
|
+
char *prefix;
|
200
|
+
char *newfix;
|
201
|
+
|
202
|
+
prefix = g_new0 (char, MAX_PREFIX_SIZE);
|
203
|
+
newfix = g_new0 (char, MAX_PREFIX_SIZE);
|
204
|
+
|
205
|
+
for (li = (GList *) rule->step1_pre; li != NULL; li = li->next)
|
206
|
+
{
|
207
|
+
ots_stem_break (li->data, prefix, newfix);
|
208
|
+
rep = ots_stem_remove_pre (normWord, prefix, newfix);
|
209
|
+
if (NULL != rep)
|
210
|
+
{
|
211
|
+
g_free (normWord);
|
212
|
+
normWord = rep;
|
213
|
+
rep = NULL;
|
214
|
+
}
|
215
|
+
}
|
216
|
+
|
217
|
+
|
218
|
+
for (li = (GList *) rule->step1_post; li != NULL; li = li->next)
|
219
|
+
{
|
220
|
+
ots_stem_break (li->data, prefix, newfix);
|
221
|
+
rep = ots_stem_remove_post(normWord, prefix, newfix);
|
222
|
+
if (NULL != rep)
|
223
|
+
{
|
224
|
+
g_free (normWord);
|
225
|
+
normWord = rep;
|
226
|
+
rep = NULL;
|
227
|
+
}
|
228
|
+
}
|
229
|
+
|
230
|
+
g_free (prefix);
|
231
|
+
g_free (newfix);
|
232
|
+
|
233
|
+
return normWord;
|
234
|
+
}
|
235
|
+
|
236
|
+
|
237
|
+
|
238
|
+
|
239
|
+
|
240
|
+
|
241
|
+
|
242
|
+
|
243
|
+
unsigned char *
|
244
|
+
ots_stem_strip (unsigned const char *aWord,const OtsStemRule * rule)
|
245
|
+
{
|
246
|
+
GList *li;
|
247
|
+
unsigned char *rep = NULL;
|
248
|
+
|
249
|
+
unsigned char *prefix;
|
250
|
+
unsigned char *newfix;
|
251
|
+
unsigned char *normWord=NULL;
|
252
|
+
|
253
|
+
prefix = g_new0 (char, MAX_PREFIX_SIZE);
|
254
|
+
newfix = g_new0 (char, MAX_PREFIX_SIZE);
|
255
|
+
|
256
|
+
if (aWord==NULL) return NULL;
|
257
|
+
|
258
|
+
normWord = ots_stem_format (aWord,rule);
|
259
|
+
|
260
|
+
|
261
|
+
for (li = (GList *) rule->manual; li != NULL; li = li->next)
|
262
|
+
{
|
263
|
+
ots_stem_break (li->data, prefix, newfix);
|
264
|
+
rep = ots_stem_replace_word (normWord, prefix, newfix);
|
265
|
+
if (NULL != rep)
|
266
|
+
{
|
267
|
+
g_free (normWord);
|
268
|
+
normWord = rep;
|
269
|
+
rep = NULL;
|
270
|
+
break;
|
271
|
+
}
|
272
|
+
}
|
273
|
+
|
274
|
+
|
275
|
+
|
276
|
+
|
277
|
+
for (li = (GList *) rule->RemovePre; li != NULL; li = li->next)
|
278
|
+
{
|
279
|
+
ots_stem_break (li->data, prefix, newfix);
|
280
|
+
rep = ots_stem_remove_pre (normWord, prefix, newfix);
|
281
|
+
if (NULL != rep)
|
282
|
+
{
|
283
|
+
g_free (normWord);
|
284
|
+
normWord = rep;
|
285
|
+
rep = NULL;
|
286
|
+
break;
|
287
|
+
}
|
288
|
+
}
|
289
|
+
|
290
|
+
|
291
|
+
for (li = (GList *) rule->RemovePost; li != NULL; li = li->next)
|
292
|
+
{
|
293
|
+
ots_stem_break (li->data, prefix, newfix);
|
294
|
+
rep = ots_stem_remove_post (normWord, prefix, newfix);
|
295
|
+
if (NULL != rep)
|
296
|
+
{
|
297
|
+
g_free (normWord);
|
298
|
+
normWord = rep;
|
299
|
+
rep = NULL;
|
300
|
+
break;
|
301
|
+
}
|
302
|
+
|
303
|
+
}
|
304
|
+
|
305
|
+
|
306
|
+
for (li = (GList *) rule->synonyms; li != NULL; li = li->next)
|
307
|
+
{
|
308
|
+
ots_stem_break (li->data, prefix, newfix);
|
309
|
+
rep = ots_stem_replace_word (normWord, prefix, newfix);
|
310
|
+
if (NULL != rep)
|
311
|
+
{
|
312
|
+
g_free (normWord);
|
313
|
+
normWord = rep;
|
314
|
+
rep = NULL;
|
315
|
+
break;
|
316
|
+
}
|
317
|
+
}
|
318
|
+
|
319
|
+
|
320
|
+
g_free (prefix);
|
321
|
+
g_free (newfix);
|
322
|
+
|
323
|
+
|
324
|
+
if (strlen(normWord)<3) /*stem is two letter long. thats not right. N(eed)==N(ation) ?*/
|
325
|
+
{
|
326
|
+
g_free(normWord);
|
327
|
+
normWord = ots_stem_format (aWord,rule); /*lowercase the word */
|
328
|
+
}
|
329
|
+
|
330
|
+
|
331
|
+
return normWord;
|
332
|
+
}
|