scws4r 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/ext/scws4r/scws.h ADDED
@@ -0,0 +1,118 @@
1
+ /**
2
+ * @file scws.h (core include)
3
+ * @author Hightman Mar
4
+ * @editor set number ; syntax on ; set autoindent ; set tabstop=4 (vim)
5
+ * $Id$
6
+ */
7
+
8
+ #ifndef _SCWS_LIBSCWS_20070531_H_
9
+ #define _SCWS_LIBSCWS_20070531_H_
10
+
11
+ #ifdef __cplusplus
12
+ extern "C" {
13
+ #endif
14
+
15
+ // #include "version.h"
16
+ #include "rule.h"
17
+ #include "xdict.h"
18
+
19
/*
 * Boolean option bits. The matching setters declared in this header are
 * scws_set_ignore(), scws_set_debug() and scws_set_duality().
 * NOTE(review): presumably OR-ed into scws_st.mode -- confirm in scws.c.
 *
 * Bits 0x02 (SCWS_SEG_MULTI) and 0x04 (SCWS_XDB_USAGE) are retired and are
 * intentionally left undefined.
 */
#define SCWS_IGN_SYMBOL     0x01
#define SCWS_DEBUG          0x08
#define SCWS_DUALITY        0x10

/* hightman.070901: multi segment policy (argument of scws_set_multi()) */
#define SCWS_MULTI_NONE     0x00000     /* nothing */
#define SCWS_MULTI_SHORT    0x01000     /* split long words to short words from left to right */
#define SCWS_MULTI_DUALITY  0x02000     /* split every long words(3 chars?) to two chars */
#define SCWS_MULTI_ZMAIN    0x04000     /* split to main single chinese char atr = j|a|n?|v? */
#define SCWS_MULTI_ZALL     0x08000     /* attr = ** , all split to single chars */
#define SCWS_MULTI_MASK     0xff000     /* mask check for multi set */

#define SCWS_ZIS_USED       0x8000000

/* Generic yes / no values. */
#define SCWS_YEA            (1)
#define SCWS_NA             (0)
37
+
38
+ /* data structures */
39
/* data structures */

/*
 * One node of the singly linked list handed out by scws_get_result() and
 * released with scws_free_result().
 */
typedef struct scws_result *scws_res_t;

struct scws_result
{
	int off;            /* offset of the word inside the text given to scws_send_text() */
	float idf;          /* idf value of the word */
	unsigned char len;  /* length of the word, counted from `off` */
	char attr[3];       /* attribute tag; NOTE(review): presumably NUL-terminated -- confirm */
	scws_res_t next;    /* following node, NULL at the end of the list */
};
49
+
50
/*
 * One node of the singly linked list handed out by scws_get_tops() /
 * scws_get_words() and released with scws_free_tops().
 */
typedef struct scws_topword *scws_top_t;

struct scws_topword
{
	char *word;         /* the word itself */
	float weight;       /* weight of the word */
	short times;        /* NOTE(review): presumably the occurrence count -- confirm */
	char attr[2];       /* attribute tag; only two bytes, so it may lack a NUL terminator */
	scws_top_t next;    /* following node, NULL at the end of the list */
};
60
+
61
/*
 * A [start, end] pair of ints; scws_st.zmap points at an array of these.
 * NOTE(review): presumably the extent of one character in the text -- confirm.
 */
struct scws_zchar
{
	int start;
	int end;
};
66
+
67
+ typedef struct scws_st scws_st, *scws_t;
68
+
69
+ struct scws_st
70
+ {
71
+ xdict_t d;
72
+ rule_t r;
73
+ unsigned char *mblen;
74
+ unsigned int mode;
75
+ unsigned char *txt;
76
+ int zis;
77
+ int len;
78
+ int off;
79
+ int wend;
80
+ scws_res_t res0;
81
+ scws_res_t res1;
82
+ word_t **wmap;
83
+ struct scws_zchar *zmap;
84
+ };
85
+
86
+ /* api: init the scws handler */
87
+ scws_t scws_new();
88
+ void scws_free(scws_t s);
89
+ /* fork instance for multi-threaded usage, but they shared the dict/rules */
90
+ scws_t scws_fork(scws_t s);
91
+
92
+ /* mode = SCWS_XDICT_XDB | SCWS_XDICT_MEM | SCWS_XDICT_TXT */
93
+ int scws_add_dict(scws_t s, const char *fpath, int mode);
94
+ int scws_set_dict(scws_t s, const char *fpath, int mode);
95
+ void scws_set_charset(scws_t s, const char *cs);
96
+ void scws_set_rule(scws_t s, const char *fpath);
97
+
98
+ /* set ignore symbol or multi segments */
99
+ void scws_set_ignore(scws_t s, int yes);
100
+ void scws_set_multi(scws_t s, int mode);
101
+ void scws_set_debug(scws_t s, int yes);
102
+ void scws_set_duality(scws_t s, int yes);
103
+
104
+ void scws_send_text(scws_t s, const char *text, int len);
105
+ scws_res_t scws_get_result(scws_t s);
106
+ void scws_free_result(scws_res_t result);
107
+
108
+ scws_top_t scws_get_tops(scws_t s, int limit, char *xattr);
109
+ void scws_free_tops(scws_top_t tops);
110
+
111
+ scws_top_t scws_get_words(scws_t s, char *xattr);
112
+ int scws_has_word(scws_t s, char *xattr);
113
+
114
+ #ifdef __cplusplus
115
+ }
116
+ #endif
117
+
118
+ #endif
@@ -0,0 +1,207 @@
1
+ #include "scws4r.h"
2
+
3
/* Return the smaller of the two ints; y is returned when they compare equal. */
int scws4r_min(int x, int y)
{
    if (x < y) {
        return x;
    }
    return y;
}
6
+
7
/* Payload wrapped inside every Scws4r Ruby object. */
struct rb_scws_malloc {
    void *engine;   /* NULL after allocation; rb_scws_init stores the scws_new() handle here */
};
10
+
11
+ static void
12
+ rb_scws_free(void *p) {
13
+ struct rb_scws_malloc *ptr = p;
14
+ free(ptr->engine);
15
+ }
16
+
17
+ static VALUE
18
+ rb_scws_alloc(VALUE klass) {
19
+ VALUE obj;
20
+ struct rb_scws_malloc *ptr;
21
+
22
+ obj = Data_Make_Struct(klass, struct rb_scws_malloc, NULL, rb_scws_free, ptr);
23
+
24
+ ptr->engine = NULL;
25
+
26
+ return obj;
27
+ }
28
+
29
+ static VALUE
30
+ rb_scws_init(VALUE self){
31
+ struct rb_scws_malloc *ptr;
32
+ Data_Get_Struct(self, struct rb_scws_malloc, ptr);
33
+ scws_t s;
34
+ s = scws_new();
35
+ ptr->engine = s;
36
+ scws_set_charset(ptr->engine, "utf8");
37
+ return self;
38
+ }
39
+
40
+ static VALUE
41
+ rb_scws_set_charset(VALUE self, VALUE r_charset){
42
+ Check_Type(r_charset, T_STRING);
43
+ char* charset = RSTRING_PTR(r_charset);
44
+ struct rb_scws_malloc *ptr;
45
+ Data_Get_Struct(self, struct rb_scws_malloc, ptr);
46
+ scws_set_charset(ptr->engine, charset);
47
+ }
48
+
49
+ static VALUE
50
+ rb_scws_set_dic(VALUE self, VALUE r_path, VALUE r_mode){
51
+ Check_Type(r_path, T_STRING);
52
+ Check_Type(r_mode, T_FIXNUM);
53
+ char* path = RSTRING_PTR(r_path);
54
+ int mode = NUM2INT(r_mode);
55
+ struct rb_scws_malloc *ptr;
56
+ Data_Get_Struct(self, struct rb_scws_malloc, ptr);
57
+ int ret = scws_set_dict(ptr->engine, path, mode);
58
+ return ret == 0;
59
+ }
60
+
61
+ static VALUE
62
+ rb_scws_add_dic(VALUE self, VALUE r_path, VALUE r_mode){
63
+ Check_Type(r_path, T_STRING);
64
+ Check_Type(r_mode, T_FIXNUM);
65
+ char* path = RSTRING_PTR(r_path);
66
+ int mode = NUM2INT(r_mode);
67
+ struct rb_scws_malloc *ptr;
68
+ Data_Get_Struct(self, struct rb_scws_malloc, ptr);
69
+ int ret = scws_add_dict(ptr->engine, path, mode);
70
+ return ret == 0;
71
+ }
72
+
73
+ static VALUE
74
+ rb_scws_set_rule(VALUE self, VALUE r_path){
75
+ Check_Type(r_path, T_STRING);
76
+ char* path = RSTRING_PTR(r_path);
77
+ struct rb_scws_malloc *ptr;
78
+ Data_Get_Struct(self, struct rb_scws_malloc, ptr);
79
+ scws_set_rule(ptr->engine, path);
80
+ }
81
+
82
+ static VALUE
83
+ rb_scws_set_ignore(VALUE self, VALUE r_yes){
84
+ struct rb_scws_malloc *ptr;
85
+ Data_Get_Struct(self, struct rb_scws_malloc, ptr);
86
+ scws_set_rule(ptr->engine, RTEST(r_yes) ? 1 : 0);
87
+ }
88
+
89
+ static VALUE
90
+ rb_scws_set_multi(VALUE self, VALUE r_mode){
91
+ Check_Type(r_mode, T_FIXNUM);
92
+ int mode = NUM2INT(r_mode);
93
+ struct rb_scws_malloc *ptr;
94
+ Data_Get_Struct(self, struct rb_scws_malloc, ptr);
95
+ scws_set_multi(ptr->engine, mode);
96
+ }
97
+
98
+ static VALUE
99
+ rb_scws_set_duality(VALUE self, VALUE r_yes){
100
+ struct rb_scws_malloc *ptr;
101
+ Data_Get_Struct(self, struct rb_scws_malloc, ptr);
102
+ scws_set_duality(ptr->engine, RTEST(r_yes) ? 1 : 0);
103
+ }
104
+
105
+ static VALUE
106
+ rb_scws_set_debug(VALUE self, VALUE r_yes){
107
+ struct rb_scws_malloc *ptr;
108
+ Data_Get_Struct(self, struct rb_scws_malloc, ptr);
109
+ scws_set_debug(ptr->engine, RTEST(r_yes) ? 1 : 0);
110
+ }
111
+
112
+ static VALUE
113
+ rb_scws_send_text(VALUE self, VALUE r_text){
114
+ Check_Type(r_text, T_STRING);
115
+ char* text = RSTRING_PTR(r_text);
116
+ struct rb_scws_malloc *ptr;
117
+ Data_Get_Struct(self, struct rb_scws_malloc, ptr);
118
+ scws_send_text(ptr->engine, text, strlen(text));
119
+ scws_res_t res, cur;
120
+ VALUE result;
121
+ result = rb_ary_new();
122
+ rb_encoding *encoding = rb_utf8_encoding();
123
+
124
+ while (res = cur = scws_get_result(ptr->engine))
125
+ {
126
+ while (cur != NULL)
127
+ {
128
+ VALUE item;
129
+ VALUE apart;
130
+ item = rb_hash_new();
131
+ rb_hash_aset(item, rb_str_new2("offset"), INT2NUM(cur->off));
132
+ rb_hash_aset(item, rb_str_new2("idf"), rb_float_new(cur->idf));
133
+ rb_hash_aset(item, rb_str_new2("length"), INT2NUM(cur->len));
134
+ rb_hash_aset(item, rb_str_new2("text"), rb_enc_str_new(text + cur->off, cur->len, encoding));
135
+ rb_hash_aset(item, rb_str_new2("attr"), rb_str_new2(cur->attr));
136
+ rb_ary_push(result, item);
137
+ cur = cur->next;
138
+ }
139
+ scws_free_result(res);
140
+ }
141
+ return result;
142
+ }
143
+
144
+ static VALUE
145
+ rb_scws_get_tops(int argc, VALUE *argv, VALUE self){
146
+ VALUE r_text, r_limit, r_attrs;
147
+ rb_scan_args(argc, argv, "21", &r_text, &r_limit, &r_attrs);
148
+ Check_Type(r_text, T_STRING);
149
+ Check_Type(r_limit, T_FIXNUM);
150
+ char* text = RSTRING_PTR(r_text);
151
+ char limit = NUM2INT(r_limit);
152
+ char* attrs;
153
+ if (T_NIL == TYPE(r_attrs)){
154
+ attrs = NULL;
155
+ }else{
156
+ Check_Type(r_attrs, T_STRING);
157
+ attrs = RSTRING_PTR(r_attrs);
158
+ }
159
+ struct rb_scws_malloc *ptr;
160
+ Data_Get_Struct(self, struct rb_scws_malloc, ptr);
161
+ scws_send_text(ptr->engine, text, strlen(text));
162
+ scws_top_t cur;
163
+ VALUE result;
164
+ result = rb_ary_new();
165
+ rb_encoding *encoding = rb_utf8_encoding();
166
+
167
+ cur = scws_get_tops(ptr->engine, limit, attrs);
168
+ while (cur != NULL)
169
+ {
170
+ VALUE item;
171
+ VALUE apart;
172
+ item = rb_hash_new();
173
+ rb_hash_aset(item, rb_str_new2("times"), INT2NUM(cur->times));
174
+ rb_hash_aset(item, rb_str_new2("weight"), rb_float_new(cur->weight));
175
+ rb_hash_aset(item, rb_str_new2("word"), rb_enc_str_new(cur->word, strlen(cur->word), encoding));
176
+ rb_hash_aset(item, rb_str_new2("attr"), rb_str_new(cur->attr, scws4r_min(strlen(cur->attr), 2)));
177
+ rb_ary_push(result, item);
178
+ cur = cur->next;
179
+ }
180
+ scws_free_tops(cur);
181
+ return result;
182
+ }
183
+
184
+ void Init_scws4r(void){
185
+ VALUE cScws;
186
+
187
+ cScws = rb_define_class("Scws4r", rb_cObject);
188
+ rb_define_alloc_func(cScws, rb_scws_alloc);
189
+ rb_define_method(cScws, "initialize", rb_scws_init, 0);
190
+ rb_define_method(cScws, "charset=", rb_scws_set_charset, 1);
191
+ rb_define_method(cScws, "set_dic", rb_scws_set_dic, 2);
192
+ rb_define_method(cScws, "add_dic", rb_scws_add_dic, 2);
193
+ rb_define_method(cScws, "set_rule", rb_scws_set_rule, 1);
194
+ rb_define_method(cScws, "punctuation_ignore=", rb_scws_set_ignore, 1);
195
+ rb_define_method(cScws, "multi=", rb_scws_set_multi, 1);
196
+ rb_define_method(cScws, "duality=", rb_scws_set_duality, 1);
197
+ rb_define_method(cScws, "debug=", rb_scws_set_debug, 1);
198
+ rb_define_method(cScws, "split", rb_scws_send_text, 1);
199
+ rb_define_method(cScws, "tops", rb_scws_get_tops, -1);
200
+ rb_define_const(cScws, "XDICT_XDB", INT2FIX(SCWS_XDICT_XDB));
201
+ rb_define_const(cScws, "XDICT_MEM", INT2FIX(SCWS_XDICT_MEM));
202
+ rb_define_const(cScws, "XDICT_TXT", INT2FIX(SCWS_XDICT_TXT));
203
+ rb_define_const(cScws, "MULTI_SHORT", INT2FIX(SCWS_MULTI_SHORT));
204
+ rb_define_const(cScws, "MULTI_DUALITY", INT2FIX(SCWS_MULTI_DUALITY));
205
+ rb_define_const(cScws, "MULTI_ZMAIN", INT2FIX(SCWS_MULTI_ZMAIN));
206
+ rb_define_const(cScws, "MULTI_ZALL", INT2FIX(SCWS_MULTI_ZALL));
207
+ }
@@ -0,0 +1,4 @@
1
+ #include <ruby.h>
2
+ #include <ruby/encoding.h>
3
+ #include "scws.h"
4
+ #include "xdict.h"
@@ -0,0 +1,4 @@
1
/*
 * version.h.in -- template input for configure.
 * The @...@ placeholders below are left exactly as configure expects them.
 */
#define SCWS_VERSION    "@VERSION@"
#define SCWS_BUGREPORT  "@PACKAGE_BUGREPORT@"
4
+