scws4r 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.rubocop.yml +13 -0
- data/CHANGELOG.md +5 -0
- data/Gemfile +13 -0
- data/Gemfile.lock +65 -0
- data/LICENSE.txt +21 -0
- data/README.md +56 -0
- data/Rakefile +20 -0
- data/defaults/dict.utf8.xdb +0 -0
- data/defaults/rules.utf8.ini +291 -0
- data/ext/scws4r/Makefile +267 -0
- data/ext/scws4r/Makefile.am +15 -0
- data/ext/scws4r/charset.c +90 -0
- data/ext/scws4r/charset.h +14 -0
- data/ext/scws4r/config_win32.h +22 -0
- data/ext/scws4r/crc32.c +103 -0
- data/ext/scws4r/crc32.h +13 -0
- data/ext/scws4r/darray.c +35 -0
- data/ext/scws4r/darray.h +22 -0
- data/ext/scws4r/extconf.rb +3 -0
- data/ext/scws4r/lock.c +153 -0
- data/ext/scws4r/lock.h +44 -0
- data/ext/scws4r/pool.c +141 -0
- data/ext/scws4r/pool.h +53 -0
- data/ext/scws4r/rule.c +407 -0
- data/ext/scws4r/rule.h +83 -0
- data/ext/scws4r/scws.c +1581 -0
- data/ext/scws4r/scws.h +118 -0
- data/ext/scws4r/scws4r.c +207 -0
- data/ext/scws4r/scws4r.h +4 -0
- data/ext/scws4r/version.h.in +4 -0
- data/ext/scws4r/xdb.c +636 -0
- data/ext/scws4r/xdb.h +88 -0
- data/ext/scws4r/xdict.c +394 -0
- data/ext/scws4r/xdict.h +73 -0
- data/ext/scws4r/xtree.c +337 -0
- data/ext/scws4r/xtree.h +65 -0
- data/lib/scws4r/version.rb +5 -0
- data/lib/scws4r.rb +15 -0
- data/scws4r.gemspec +30 -0
- data/sig/scws.rbs +4 -0
- data/test.rb +16 -0
- metadata +88 -0
data/ext/scws4r/scws.h
ADDED
@@ -0,0 +1,118 @@
|
|
1
|
+
/**
|
2
|
+
* @file scws.h (core include)
|
3
|
+
* @author Hightman Mar
|
4
|
+
* @editor set number ; syntax on ; set autoindent ; set tabstop=4 (vim)
|
5
|
+
* $Id$
|
6
|
+
*/
|
7
|
+
|
8
|
+
#ifndef _SCWS_LIBSCWS_20070531_H_
|
9
|
+
#define _SCWS_LIBSCWS_20070531_H_
|
10
|
+
|
11
|
+
#ifdef __cplusplus
|
12
|
+
extern "C" {
|
13
|
+
#endif
|
14
|
+
|
15
|
+
// #include "version.h"
|
16
|
+
#include "rule.h"
|
17
|
+
#include "xdict.h"
|
18
|
+
|
19
|
+
#define SCWS_IGN_SYMBOL 0x01
|
20
|
+
//#define SCWS_SEG_MULTI 0x02
|
21
|
+
//#define SCWS_XDB_USAGE 0x04
|
22
|
+
#define SCWS_DEBUG 0x08
|
23
|
+
#define SCWS_DUALITY 0x10
|
24
|
+
|
25
|
+
/* hightman.070901: multi-segment policy flags (passed to scws_set_multi) */
|
26
|
+
#define SCWS_MULTI_NONE 0x00000 // no multi-segmentation
|
27
|
+
#define SCWS_MULTI_SHORT 0x01000 // split long words into short words, from left to right
|
28
|
+
#define SCWS_MULTI_DUALITY 0x02000 // split every long word (3 chars?) into two-char words
|
29
|
+
#define SCWS_MULTI_ZMAIN 0x04000 // split into main single Chinese chars, attr = j|a|n?|v?
|
30
|
+
#define SCWS_MULTI_ZALL 0x08000 // attr = **, split everything into single chars
|
31
|
+
#define SCWS_MULTI_MASK 0xff000 // mask used to test for any multi-segment policy bit
|
32
|
+
|
33
|
+
#define SCWS_ZIS_USED 0x8000000
|
34
|
+
|
35
|
+
#define SCWS_YEA (1)
|
36
|
+
#define SCWS_NA (0)
|
37
|
+
|
38
|
+
/* data structures */
|
39
|
+
typedef struct scws_result *scws_res_t;
|
40
|
+
|
41
|
+
/* One segmented word, returned as a node of the singly linked list produced by scws_get_result(). */
struct scws_result
|
42
|
+
{
|
43
|
+
int off; /* offset of the word inside the text given to scws_send_text(); the Ruby binding uses it as a byte offset */
|
44
|
+
float idf; /* idf value of the word (presumably inverse document frequency -- confirm) */
|
45
|
+
unsigned char len; /* length of the word; the Ruby binding uses it as a byte count */
|
46
|
+
char attr[3]; /* attribute tag; the Ruby binding reads it as a NUL-terminated C string */
|
47
|
+
scws_res_t next; /* next node in the list, NULL at the end */
|
48
|
+
};
|
49
|
+
|
50
|
+
typedef struct scws_topword *scws_top_t;
|
51
|
+
|
52
|
+
/* One entry of the singly linked list returned by scws_get_tops() / scws_get_words(). */
struct scws_topword
|
53
|
+
{
|
54
|
+
char *word; /* the word itself; the Ruby binding reads it as a NUL-terminated C string */
|
55
|
+
float weight; /* weight of the word (how it is computed is not visible here) */
|
56
|
+
short times; /* presumably the number of occurrences in the text -- confirm */
|
57
|
+
char attr[2]; /* attribute tag; only 2 bytes, so it may not be NUL-terminated */
|
58
|
+
scws_top_t next; /* next entry in the list, NULL at the end */
|
59
|
+
};
|
60
|
+
|
61
|
+
/* A start/end pair; used as the element type of scws_st.zmap. */
struct scws_zchar
|
62
|
+
{
|
63
|
+
int start; /* presumably the start offset of one character in the text -- confirm */
|
64
|
+
int end; /* presumably the end offset of that character -- confirm */
|
65
|
+
};
|
66
|
+
|
67
|
+
typedef struct scws_st scws_st, *scws_t;
|
68
|
+
|
69
|
+
/* State of one segmenter handle, created by scws_new() and released by scws_free(). */
struct scws_st
|
70
|
+
{
|
71
|
+
xdict_t d; /* dictionary handle (type declared in xdict.h) */
|
72
|
+
rule_t r; /* rule set handle (type declared in rule.h) */
|
73
|
+
unsigned char *mblen; /* presumably a per-byte character-length table for the current charset -- confirm */
|
74
|
+
unsigned int mode; /* presumably a bitmask of the SCWS_* / SCWS_MULTI_* flags -- confirm */
|
75
|
+
unsigned char *txt; /* presumably the text handed to scws_send_text() -- confirm whether it is copied or borrowed */
|
76
|
+
int zis; /* NOTE(review): likely related to SCWS_ZIS_USED -- confirm */
|
77
|
+
int len; /* presumably the length of txt -- confirm */
|
78
|
+
int off; /* presumably the current read offset into txt -- confirm */
|
79
|
+
int wend; /* NOTE(review): meaning not visible in this header -- confirm in scws.c */
|
80
|
+
scws_res_t res0; /* result list pointer (presumably the head -- confirm) */
|
81
|
+
scws_res_t res1; /* result list pointer (presumably the tail -- confirm) */
|
82
|
+
word_t **wmap; /* NOTE(review): word map, layout not visible here -- confirm in scws.c */
|
83
|
+
struct scws_zchar *zmap; /* array of start/end pairs, see struct scws_zchar */
|
84
|
+
};
|
85
|
+
|
86
|
+
/* api: init the scws handler */
|
87
|
+
scws_t scws_new();
|
88
|
+
void scws_free(scws_t s);
|
89
|
+
/* fork an instance for multi-threaded usage; the forked instances share the dict/rules */
|
90
|
+
scws_t scws_fork(scws_t s);
|
91
|
+
|
92
|
+
/* mode = SCWS_XDICT_XDB | SCWS_XDICT_MEM | SCWS_XDICT_TXT */
|
93
|
+
int scws_add_dict(scws_t s, const char *fpath, int mode);
|
94
|
+
int scws_set_dict(scws_t s, const char *fpath, int mode);
|
95
|
+
void scws_set_charset(scws_t s, const char *cs);
|
96
|
+
void scws_set_rule(scws_t s, const char *fpath);
|
97
|
+
|
98
|
+
/* set ignore symbol or multi segments */
|
99
|
+
void scws_set_ignore(scws_t s, int yes);
|
100
|
+
void scws_set_multi(scws_t s, int mode);
|
101
|
+
void scws_set_debug(scws_t s, int yes);
|
102
|
+
void scws_set_duality(scws_t s, int yes);
|
103
|
+
|
104
|
+
void scws_send_text(scws_t s, const char *text, int len);
|
105
|
+
scws_res_t scws_get_result(scws_t s);
|
106
|
+
void scws_free_result(scws_res_t result);
|
107
|
+
|
108
|
+
scws_top_t scws_get_tops(scws_t s, int limit, char *xattr);
|
109
|
+
void scws_free_tops(scws_top_t tops);
|
110
|
+
|
111
|
+
scws_top_t scws_get_words(scws_t s, char *xattr);
|
112
|
+
int scws_has_word(scws_t s, char *xattr);
|
113
|
+
|
114
|
+
#ifdef __cplusplus
|
115
|
+
}
|
116
|
+
#endif
|
117
|
+
|
118
|
+
#endif
|
data/ext/scws4r/scws4r.c
ADDED
@@ -0,0 +1,207 @@
|
|
1
|
+
#include "scws4r.h"
|
2
|
+
|
3
|
+
/* Return the smaller of the two integers x and y. */
int scws4r_min(int x, int y){
    if (y <= x) {
        return y;
    }
    return x;
}
|
6
|
+
|
7
|
+
/* C payload wrapped inside every Scws4r Ruby object. */
struct rb_scws_malloc{
|
8
|
+
void *engine; /* scws_t handle from scws_new(); NULL until initialize has run */
|
9
|
+
};
|
10
|
+
|
11
|
+
static void
|
12
|
+
rb_scws_free(void *p) {
|
13
|
+
struct rb_scws_malloc *ptr = p;
|
14
|
+
free(ptr->engine);
|
15
|
+
}
|
16
|
+
|
17
|
+
static VALUE
|
18
|
+
rb_scws_alloc(VALUE klass) {
|
19
|
+
VALUE obj;
|
20
|
+
struct rb_scws_malloc *ptr;
|
21
|
+
|
22
|
+
obj = Data_Make_Struct(klass, struct rb_scws_malloc, NULL, rb_scws_free, ptr);
|
23
|
+
|
24
|
+
ptr->engine = NULL;
|
25
|
+
|
26
|
+
return obj;
|
27
|
+
}
|
28
|
+
|
29
|
+
static VALUE
|
30
|
+
rb_scws_init(VALUE self){
|
31
|
+
struct rb_scws_malloc *ptr;
|
32
|
+
Data_Get_Struct(self, struct rb_scws_malloc, ptr);
|
33
|
+
scws_t s;
|
34
|
+
s = scws_new();
|
35
|
+
ptr->engine = s;
|
36
|
+
scws_set_charset(ptr->engine, "utf8");
|
37
|
+
return self;
|
38
|
+
}
|
39
|
+
|
40
|
+
static VALUE
|
41
|
+
rb_scws_set_charset(VALUE self, VALUE r_charset){
|
42
|
+
Check_Type(r_charset, T_STRING);
|
43
|
+
char* charset = RSTRING_PTR(r_charset);
|
44
|
+
struct rb_scws_malloc *ptr;
|
45
|
+
Data_Get_Struct(self, struct rb_scws_malloc, ptr);
|
46
|
+
scws_set_charset(ptr->engine, charset);
|
47
|
+
}
|
48
|
+
|
49
|
+
static VALUE
|
50
|
+
rb_scws_set_dic(VALUE self, VALUE r_path, VALUE r_mode){
|
51
|
+
Check_Type(r_path, T_STRING);
|
52
|
+
Check_Type(r_mode, T_FIXNUM);
|
53
|
+
char* path = RSTRING_PTR(r_path);
|
54
|
+
int mode = NUM2INT(r_mode);
|
55
|
+
struct rb_scws_malloc *ptr;
|
56
|
+
Data_Get_Struct(self, struct rb_scws_malloc, ptr);
|
57
|
+
int ret = scws_set_dict(ptr->engine, path, mode);
|
58
|
+
return ret == 0;
|
59
|
+
}
|
60
|
+
|
61
|
+
static VALUE
|
62
|
+
rb_scws_add_dic(VALUE self, VALUE r_path, VALUE r_mode){
|
63
|
+
Check_Type(r_path, T_STRING);
|
64
|
+
Check_Type(r_mode, T_FIXNUM);
|
65
|
+
char* path = RSTRING_PTR(r_path);
|
66
|
+
int mode = NUM2INT(r_mode);
|
67
|
+
struct rb_scws_malloc *ptr;
|
68
|
+
Data_Get_Struct(self, struct rb_scws_malloc, ptr);
|
69
|
+
int ret = scws_add_dict(ptr->engine, path, mode);
|
70
|
+
return ret == 0;
|
71
|
+
}
|
72
|
+
|
73
|
+
static VALUE
|
74
|
+
rb_scws_set_rule(VALUE self, VALUE r_path){
|
75
|
+
Check_Type(r_path, T_STRING);
|
76
|
+
char* path = RSTRING_PTR(r_path);
|
77
|
+
struct rb_scws_malloc *ptr;
|
78
|
+
Data_Get_Struct(self, struct rb_scws_malloc, ptr);
|
79
|
+
scws_set_rule(ptr->engine, path);
|
80
|
+
}
|
81
|
+
|
82
|
+
static VALUE
|
83
|
+
rb_scws_set_ignore(VALUE self, VALUE r_yes){
|
84
|
+
struct rb_scws_malloc *ptr;
|
85
|
+
Data_Get_Struct(self, struct rb_scws_malloc, ptr);
|
86
|
+
scws_set_rule(ptr->engine, RTEST(r_yes) ? 1 : 0);
|
87
|
+
}
|
88
|
+
|
89
|
+
static VALUE
|
90
|
+
rb_scws_set_multi(VALUE self, VALUE r_mode){
|
91
|
+
Check_Type(r_mode, T_FIXNUM);
|
92
|
+
int mode = NUM2INT(r_mode);
|
93
|
+
struct rb_scws_malloc *ptr;
|
94
|
+
Data_Get_Struct(self, struct rb_scws_malloc, ptr);
|
95
|
+
scws_set_multi(ptr->engine, mode);
|
96
|
+
}
|
97
|
+
|
98
|
+
static VALUE
|
99
|
+
rb_scws_set_duality(VALUE self, VALUE r_yes){
|
100
|
+
struct rb_scws_malloc *ptr;
|
101
|
+
Data_Get_Struct(self, struct rb_scws_malloc, ptr);
|
102
|
+
scws_set_duality(ptr->engine, RTEST(r_yes) ? 1 : 0);
|
103
|
+
}
|
104
|
+
|
105
|
+
static VALUE
|
106
|
+
rb_scws_set_debug(VALUE self, VALUE r_yes){
|
107
|
+
struct rb_scws_malloc *ptr;
|
108
|
+
Data_Get_Struct(self, struct rb_scws_malloc, ptr);
|
109
|
+
scws_set_debug(ptr->engine, RTEST(r_yes) ? 1 : 0);
|
110
|
+
}
|
111
|
+
|
112
|
+
static VALUE
|
113
|
+
rb_scws_send_text(VALUE self, VALUE r_text){
|
114
|
+
Check_Type(r_text, T_STRING);
|
115
|
+
char* text = RSTRING_PTR(r_text);
|
116
|
+
struct rb_scws_malloc *ptr;
|
117
|
+
Data_Get_Struct(self, struct rb_scws_malloc, ptr);
|
118
|
+
scws_send_text(ptr->engine, text, strlen(text));
|
119
|
+
scws_res_t res, cur;
|
120
|
+
VALUE result;
|
121
|
+
result = rb_ary_new();
|
122
|
+
rb_encoding *encoding = rb_utf8_encoding();
|
123
|
+
|
124
|
+
while (res = cur = scws_get_result(ptr->engine))
|
125
|
+
{
|
126
|
+
while (cur != NULL)
|
127
|
+
{
|
128
|
+
VALUE item;
|
129
|
+
VALUE apart;
|
130
|
+
item = rb_hash_new();
|
131
|
+
rb_hash_aset(item, rb_str_new2("offset"), INT2NUM(cur->off));
|
132
|
+
rb_hash_aset(item, rb_str_new2("idf"), rb_float_new(cur->idf));
|
133
|
+
rb_hash_aset(item, rb_str_new2("length"), INT2NUM(cur->len));
|
134
|
+
rb_hash_aset(item, rb_str_new2("text"), rb_enc_str_new(text + cur->off, cur->len, encoding));
|
135
|
+
rb_hash_aset(item, rb_str_new2("attr"), rb_str_new2(cur->attr));
|
136
|
+
rb_ary_push(result, item);
|
137
|
+
cur = cur->next;
|
138
|
+
}
|
139
|
+
scws_free_result(res);
|
140
|
+
}
|
141
|
+
return result;
|
142
|
+
}
|
143
|
+
|
144
|
+
static VALUE
|
145
|
+
rb_scws_get_tops(int argc, VALUE *argv, VALUE self){
|
146
|
+
VALUE r_text, r_limit, r_attrs;
|
147
|
+
rb_scan_args(argc, argv, "21", &r_text, &r_limit, &r_attrs);
|
148
|
+
Check_Type(r_text, T_STRING);
|
149
|
+
Check_Type(r_limit, T_FIXNUM);
|
150
|
+
char* text = RSTRING_PTR(r_text);
|
151
|
+
char limit = NUM2INT(r_limit);
|
152
|
+
char* attrs;
|
153
|
+
if (T_NIL == TYPE(r_attrs)){
|
154
|
+
attrs = NULL;
|
155
|
+
}else{
|
156
|
+
Check_Type(r_attrs, T_STRING);
|
157
|
+
attrs = RSTRING_PTR(r_attrs);
|
158
|
+
}
|
159
|
+
struct rb_scws_malloc *ptr;
|
160
|
+
Data_Get_Struct(self, struct rb_scws_malloc, ptr);
|
161
|
+
scws_send_text(ptr->engine, text, strlen(text));
|
162
|
+
scws_top_t cur;
|
163
|
+
VALUE result;
|
164
|
+
result = rb_ary_new();
|
165
|
+
rb_encoding *encoding = rb_utf8_encoding();
|
166
|
+
|
167
|
+
cur = scws_get_tops(ptr->engine, limit, attrs);
|
168
|
+
while (cur != NULL)
|
169
|
+
{
|
170
|
+
VALUE item;
|
171
|
+
VALUE apart;
|
172
|
+
item = rb_hash_new();
|
173
|
+
rb_hash_aset(item, rb_str_new2("times"), INT2NUM(cur->times));
|
174
|
+
rb_hash_aset(item, rb_str_new2("weight"), rb_float_new(cur->weight));
|
175
|
+
rb_hash_aset(item, rb_str_new2("word"), rb_enc_str_new(cur->word, strlen(cur->word), encoding));
|
176
|
+
rb_hash_aset(item, rb_str_new2("attr"), rb_str_new(cur->attr, scws4r_min(strlen(cur->attr), 2)));
|
177
|
+
rb_ary_push(result, item);
|
178
|
+
cur = cur->next;
|
179
|
+
}
|
180
|
+
scws_free_tops(cur);
|
181
|
+
return result;
|
182
|
+
}
|
183
|
+
|
184
|
+
void Init_scws4r(void){
|
185
|
+
VALUE cScws;
|
186
|
+
|
187
|
+
cScws = rb_define_class("Scws4r", rb_cObject);
|
188
|
+
rb_define_alloc_func(cScws, rb_scws_alloc);
|
189
|
+
rb_define_method(cScws, "initialize", rb_scws_init, 0);
|
190
|
+
rb_define_method(cScws, "charset=", rb_scws_set_charset, 1);
|
191
|
+
rb_define_method(cScws, "set_dic", rb_scws_set_dic, 2);
|
192
|
+
rb_define_method(cScws, "add_dic", rb_scws_add_dic, 2);
|
193
|
+
rb_define_method(cScws, "set_rule", rb_scws_set_rule, 1);
|
194
|
+
rb_define_method(cScws, "punctuation_ignore=", rb_scws_set_ignore, 1);
|
195
|
+
rb_define_method(cScws, "multi=", rb_scws_set_multi, 1);
|
196
|
+
rb_define_method(cScws, "duality=", rb_scws_set_duality, 1);
|
197
|
+
rb_define_method(cScws, "debug=", rb_scws_set_debug, 1);
|
198
|
+
rb_define_method(cScws, "split", rb_scws_send_text, 1);
|
199
|
+
rb_define_method(cScws, "tops", rb_scws_get_tops, -1);
|
200
|
+
rb_define_const(cScws, "XDICT_XDB", INT2FIX(SCWS_XDICT_XDB));
|
201
|
+
rb_define_const(cScws, "XDICT_MEM", INT2FIX(SCWS_XDICT_MEM));
|
202
|
+
rb_define_const(cScws, "XDICT_TXT", INT2FIX(SCWS_XDICT_TXT));
|
203
|
+
rb_define_const(cScws, "MULTI_SHORT", INT2FIX(SCWS_MULTI_SHORT));
|
204
|
+
rb_define_const(cScws, "MULTI_DUALITY", INT2FIX(SCWS_MULTI_DUALITY));
|
205
|
+
rb_define_const(cScws, "MULTI_ZMAIN", INT2FIX(SCWS_MULTI_ZMAIN));
|
206
|
+
rb_define_const(cScws, "MULTI_ZALL", INT2FIX(SCWS_MULTI_ZALL));
|
207
|
+
}
|
data/ext/scws4r/scws4r.h
ADDED