scws4r 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/ext/scws4r/scws.h ADDED
@@ -0,0 +1,118 @@
1
+ /**
2
+ * @file scws.h (core include)
3
+ * @author Hightman Mar
4
+ * @editor set number ; syntax on ; set autoindent ; set tabstop=4 (vim)
5
+ * $Id$
6
+ */
7
+
8
+ #ifndef _SCWS_LIBSCWS_20070531_H_
9
+ #define _SCWS_LIBSCWS_20070531_H_
10
+
11
+ #ifdef __cplusplus
12
+ extern "C" {
13
+ #endif
14
+
15
+ // #include "version.h"
16
+ #include "rule.h"
17
+ #include "xdict.h"
18
+
19
/*
 * Bit flags for the segmenter (see the scws_set_ignore / scws_set_debug /
 * scws_set_duality setters declared further down in this header).
 * 0x02 (formerly SCWS_SEG_MULTI) and 0x04 (formerly SCWS_XDB_USAGE) are
 * intentionally left undefined here.
 */
#define SCWS_IGN_SYMBOL     0x01
#define SCWS_DEBUG          0x08
#define SCWS_DUALITY        0x10

/*
 * hightman.070901: multi segment policy.
 * These values are the `mode` argument accepted by scws_set_multi().
 */
#define SCWS_MULTI_NONE     0x00000     /* no extra splitting */
#define SCWS_MULTI_SHORT    0x01000     /* split long words into short words, left to right */
#define SCWS_MULTI_DUALITY  0x02000     /* split every long word (3 chars?) into two-char pieces */
#define SCWS_MULTI_ZMAIN    0x04000     /* split into the main single Chinese chars, attr = j|a|n?|v? */
#define SCWS_MULTI_ZALL     0x08000     /* attr = **, split everything into single chars */
#define SCWS_MULTI_MASK     0xff000     /* mask covering all multi-segment bits */

/* NOTE(review): presumably a marker bit for the `zis` field of struct scws_st -- confirm in scws.c */
#define SCWS_ZIS_USED       0x8000000

/* Boolean arguments for the yes/no setters. */
#define SCWS_YEA            (1)
#define SCWS_NA             (0)
37
+
38
+ /* data structures */
39
/* Handle type: pointer to one node of a segmentation result list. */
typedef struct scws_result *scws_res_t;

/*
 * One segmented word, chained through `next` into a singly linked list.
 * Lists are obtained from scws_get_result() and released with
 * scws_free_result().
 */
struct scws_result {
    int             off;        /* offset of the word inside the text given to scws_send_text() */
    float           idf;        /* idf value of the word */
    unsigned char   len;        /* length of the word, counted from `off` */
    char            attr[3];    /* attribute tag; NOTE(review): presumably NUL-terminated -- confirm */
    scws_res_t      next;       /* following node, NULL at the end of the list */
};
49
+
50
/* Handle type: pointer to one node of a top-word list. */
typedef struct scws_topword *scws_top_t;

/*
 * One entry of the list returned by scws_get_tops() / scws_get_words(),
 * chained through `next` and released with scws_free_tops().
 */
struct scws_topword {
    char       *word;       /* the word text (C string) */
    float       weight;     /* weight of the word */
    short       times;      /* occurrence counter */
    char        attr[2];    /* attribute tag; only 2 bytes, so do not assume a NUL terminator */
    scws_top_t  next;       /* following node, NULL at the end of the list */
};
60
+
61
/*
 * A start/end pair; struct scws_st keeps an array of these in `zmap`.
 * NOTE(review): presumably byte positions of one character in the text -- confirm in scws.c
 */
struct scws_zchar {
    int start;
    int end;
};
66
+
67
+ typedef struct scws_st scws_st, *scws_t;
68
+
69
+ struct scws_st
70
+ {
71
+ xdict_t d;
72
+ rule_t r;
73
+ unsigned char *mblen;
74
+ unsigned int mode;
75
+ unsigned char *txt;
76
+ int zis;
77
+ int len;
78
+ int off;
79
+ int wend;
80
+ scws_res_t res0;
81
+ scws_res_t res1;
82
+ word_t **wmap;
83
+ struct scws_zchar *zmap;
84
+ };
85
+
86
+ /* api: init the scws handler */
87
+ scws_t scws_new();
88
+ void scws_free(scws_t s);
89
+ /* fork instance for multi-threaded usage, but they shared the dict/rules */
90
+ scws_t scws_fork(scws_t s);
91
+
92
+ /* mode = SCWS_XDICT_XDB | SCWS_XDICT_MEM | SCWS_XDICT_TXT */
93
+ int scws_add_dict(scws_t s, const char *fpath, int mode);
94
+ int scws_set_dict(scws_t s, const char *fpath, int mode);
95
+ void scws_set_charset(scws_t s, const char *cs);
96
+ void scws_set_rule(scws_t s, const char *fpath);
97
+
98
+ /* set ignore symbol or multi segments */
99
+ void scws_set_ignore(scws_t s, int yes);
100
+ void scws_set_multi(scws_t s, int mode);
101
+ void scws_set_debug(scws_t s, int yes);
102
+ void scws_set_duality(scws_t s, int yes);
103
+
104
+ void scws_send_text(scws_t s, const char *text, int len);
105
+ scws_res_t scws_get_result(scws_t s);
106
+ void scws_free_result(scws_res_t result);
107
+
108
+ scws_top_t scws_get_tops(scws_t s, int limit, char *xattr);
109
+ void scws_free_tops(scws_top_t tops);
110
+
111
+ scws_top_t scws_get_words(scws_t s, char *xattr);
112
+ int scws_has_word(scws_t s, char *xattr);
113
+
114
+ #ifdef __cplusplus
115
+ }
116
+ #endif
117
+
118
+ #endif
@@ -0,0 +1,207 @@
1
+ #include "scws4r.h"
2
+
3
/* Return the smaller of x and y (y when both are equal). */
int scws4r_min(int x, int y){
    if (x < y) {
        return x;
    }
    return y;
}
6
+
7
/* Native payload wrapped inside every Scws4r Ruby object. */
struct rb_scws_malloc
{
    /* scws_t handle from scws_new(); NULL between allocation and initialize */
    void *engine;
};
10
+
11
+ static void
12
+ rb_scws_free(void *p) {
13
+ struct rb_scws_malloc *ptr = p;
14
+ free(ptr->engine);
15
+ }
16
+
17
+ static VALUE
18
+ rb_scws_alloc(VALUE klass) {
19
+ VALUE obj;
20
+ struct rb_scws_malloc *ptr;
21
+
22
+ obj = Data_Make_Struct(klass, struct rb_scws_malloc, NULL, rb_scws_free, ptr);
23
+
24
+ ptr->engine = NULL;
25
+
26
+ return obj;
27
+ }
28
+
29
+ static VALUE
30
+ rb_scws_init(VALUE self){
31
+ struct rb_scws_malloc *ptr;
32
+ Data_Get_Struct(self, struct rb_scws_malloc, ptr);
33
+ scws_t s;
34
+ s = scws_new();
35
+ ptr->engine = s;
36
+ scws_set_charset(ptr->engine, "utf8");
37
+ return self;
38
+ }
39
+
40
+ static VALUE
41
+ rb_scws_set_charset(VALUE self, VALUE r_charset){
42
+ Check_Type(r_charset, T_STRING);
43
+ char* charset = RSTRING_PTR(r_charset);
44
+ struct rb_scws_malloc *ptr;
45
+ Data_Get_Struct(self, struct rb_scws_malloc, ptr);
46
+ scws_set_charset(ptr->engine, charset);
47
+ }
48
+
49
+ static VALUE
50
+ rb_scws_set_dic(VALUE self, VALUE r_path, VALUE r_mode){
51
+ Check_Type(r_path, T_STRING);
52
+ Check_Type(r_mode, T_FIXNUM);
53
+ char* path = RSTRING_PTR(r_path);
54
+ int mode = NUM2INT(r_mode);
55
+ struct rb_scws_malloc *ptr;
56
+ Data_Get_Struct(self, struct rb_scws_malloc, ptr);
57
+ int ret = scws_set_dict(ptr->engine, path, mode);
58
+ return ret == 0;
59
+ }
60
+
61
+ static VALUE
62
+ rb_scws_add_dic(VALUE self, VALUE r_path, VALUE r_mode){
63
+ Check_Type(r_path, T_STRING);
64
+ Check_Type(r_mode, T_FIXNUM);
65
+ char* path = RSTRING_PTR(r_path);
66
+ int mode = NUM2INT(r_mode);
67
+ struct rb_scws_malloc *ptr;
68
+ Data_Get_Struct(self, struct rb_scws_malloc, ptr);
69
+ int ret = scws_add_dict(ptr->engine, path, mode);
70
+ return ret == 0;
71
+ }
72
+
73
+ static VALUE
74
+ rb_scws_set_rule(VALUE self, VALUE r_path){
75
+ Check_Type(r_path, T_STRING);
76
+ char* path = RSTRING_PTR(r_path);
77
+ struct rb_scws_malloc *ptr;
78
+ Data_Get_Struct(self, struct rb_scws_malloc, ptr);
79
+ scws_set_rule(ptr->engine, path);
80
+ }
81
+
82
+ static VALUE
83
+ rb_scws_set_ignore(VALUE self, VALUE r_yes){
84
+ struct rb_scws_malloc *ptr;
85
+ Data_Get_Struct(self, struct rb_scws_malloc, ptr);
86
+ scws_set_rule(ptr->engine, RTEST(r_yes) ? 1 : 0);
87
+ }
88
+
89
+ static VALUE
90
+ rb_scws_set_multi(VALUE self, VALUE r_mode){
91
+ Check_Type(r_mode, T_FIXNUM);
92
+ int mode = NUM2INT(r_mode);
93
+ struct rb_scws_malloc *ptr;
94
+ Data_Get_Struct(self, struct rb_scws_malloc, ptr);
95
+ scws_set_multi(ptr->engine, mode);
96
+ }
97
+
98
+ static VALUE
99
+ rb_scws_set_duality(VALUE self, VALUE r_yes){
100
+ struct rb_scws_malloc *ptr;
101
+ Data_Get_Struct(self, struct rb_scws_malloc, ptr);
102
+ scws_set_duality(ptr->engine, RTEST(r_yes) ? 1 : 0);
103
+ }
104
+
105
+ static VALUE
106
+ rb_scws_set_debug(VALUE self, VALUE r_yes){
107
+ struct rb_scws_malloc *ptr;
108
+ Data_Get_Struct(self, struct rb_scws_malloc, ptr);
109
+ scws_set_debug(ptr->engine, RTEST(r_yes) ? 1 : 0);
110
+ }
111
+
112
+ static VALUE
113
+ rb_scws_send_text(VALUE self, VALUE r_text){
114
+ Check_Type(r_text, T_STRING);
115
+ char* text = RSTRING_PTR(r_text);
116
+ struct rb_scws_malloc *ptr;
117
+ Data_Get_Struct(self, struct rb_scws_malloc, ptr);
118
+ scws_send_text(ptr->engine, text, strlen(text));
119
+ scws_res_t res, cur;
120
+ VALUE result;
121
+ result = rb_ary_new();
122
+ rb_encoding *encoding = rb_utf8_encoding();
123
+
124
+ while (res = cur = scws_get_result(ptr->engine))
125
+ {
126
+ while (cur != NULL)
127
+ {
128
+ VALUE item;
129
+ VALUE apart;
130
+ item = rb_hash_new();
131
+ rb_hash_aset(item, rb_str_new2("offset"), INT2NUM(cur->off));
132
+ rb_hash_aset(item, rb_str_new2("idf"), rb_float_new(cur->idf));
133
+ rb_hash_aset(item, rb_str_new2("length"), INT2NUM(cur->len));
134
+ rb_hash_aset(item, rb_str_new2("text"), rb_enc_str_new(text + cur->off, cur->len, encoding));
135
+ rb_hash_aset(item, rb_str_new2("attr"), rb_str_new2(cur->attr));
136
+ rb_ary_push(result, item);
137
+ cur = cur->next;
138
+ }
139
+ scws_free_result(res);
140
+ }
141
+ return result;
142
+ }
143
+
144
+ static VALUE
145
+ rb_scws_get_tops(int argc, VALUE *argv, VALUE self){
146
+ VALUE r_text, r_limit, r_attrs;
147
+ rb_scan_args(argc, argv, "21", &r_text, &r_limit, &r_attrs);
148
+ Check_Type(r_text, T_STRING);
149
+ Check_Type(r_limit, T_FIXNUM);
150
+ char* text = RSTRING_PTR(r_text);
151
+ char limit = NUM2INT(r_limit);
152
+ char* attrs;
153
+ if (T_NIL == TYPE(r_attrs)){
154
+ attrs = NULL;
155
+ }else{
156
+ Check_Type(r_attrs, T_STRING);
157
+ attrs = RSTRING_PTR(r_attrs);
158
+ }
159
+ struct rb_scws_malloc *ptr;
160
+ Data_Get_Struct(self, struct rb_scws_malloc, ptr);
161
+ scws_send_text(ptr->engine, text, strlen(text));
162
+ scws_top_t cur;
163
+ VALUE result;
164
+ result = rb_ary_new();
165
+ rb_encoding *encoding = rb_utf8_encoding();
166
+
167
+ cur = scws_get_tops(ptr->engine, limit, attrs);
168
+ while (cur != NULL)
169
+ {
170
+ VALUE item;
171
+ VALUE apart;
172
+ item = rb_hash_new();
173
+ rb_hash_aset(item, rb_str_new2("times"), INT2NUM(cur->times));
174
+ rb_hash_aset(item, rb_str_new2("weight"), rb_float_new(cur->weight));
175
+ rb_hash_aset(item, rb_str_new2("word"), rb_enc_str_new(cur->word, strlen(cur->word), encoding));
176
+ rb_hash_aset(item, rb_str_new2("attr"), rb_str_new(cur->attr, scws4r_min(strlen(cur->attr), 2)));
177
+ rb_ary_push(result, item);
178
+ cur = cur->next;
179
+ }
180
+ scws_free_tops(cur);
181
+ return result;
182
+ }
183
+
184
+ void Init_scws4r(void){
185
+ VALUE cScws;
186
+
187
+ cScws = rb_define_class("Scws4r", rb_cObject);
188
+ rb_define_alloc_func(cScws, rb_scws_alloc);
189
+ rb_define_method(cScws, "initialize", rb_scws_init, 0);
190
+ rb_define_method(cScws, "charset=", rb_scws_set_charset, 1);
191
+ rb_define_method(cScws, "set_dic", rb_scws_set_dic, 2);
192
+ rb_define_method(cScws, "add_dic", rb_scws_add_dic, 2);
193
+ rb_define_method(cScws, "set_rule", rb_scws_set_rule, 1);
194
+ rb_define_method(cScws, "punctuation_ignore=", rb_scws_set_ignore, 1);
195
+ rb_define_method(cScws, "multi=", rb_scws_set_multi, 1);
196
+ rb_define_method(cScws, "duality=", rb_scws_set_duality, 1);
197
+ rb_define_method(cScws, "debug=", rb_scws_set_debug, 1);
198
+ rb_define_method(cScws, "split", rb_scws_send_text, 1);
199
+ rb_define_method(cScws, "tops", rb_scws_get_tops, -1);
200
+ rb_define_const(cScws, "XDICT_XDB", INT2FIX(SCWS_XDICT_XDB));
201
+ rb_define_const(cScws, "XDICT_MEM", INT2FIX(SCWS_XDICT_MEM));
202
+ rb_define_const(cScws, "XDICT_TXT", INT2FIX(SCWS_XDICT_TXT));
203
+ rb_define_const(cScws, "MULTI_SHORT", INT2FIX(SCWS_MULTI_SHORT));
204
+ rb_define_const(cScws, "MULTI_DUALITY", INT2FIX(SCWS_MULTI_DUALITY));
205
+ rb_define_const(cScws, "MULTI_ZMAIN", INT2FIX(SCWS_MULTI_ZMAIN));
206
+ rb_define_const(cScws, "MULTI_ZALL", INT2FIX(SCWS_MULTI_ZALL));
207
+ }
@@ -0,0 +1,4 @@
1
+ #include <ruby.h>
2
+ #include <ruby/encoding.h>
3
+ #include "scws.h"
4
+ #include "xdict.h"
@@ -0,0 +1,4 @@
1
/*
 * version.h.in -- input file for configure.
 * NOTE(review): the @...@ tokens are presumably substituted by configure
 * when it generates version.h -- confirm against the build scripts.
 */
#define SCWS_VERSION    "@VERSION@"
#define SCWS_BUGREPORT  "@PACKAGE_BUGREPORT@"
4
+