scws4r 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,337 @@
1
+ /**
2
+ * @file xtree.c
3
+ * @author Hightman Mar
4
+ * @editor set number ; syntax on ; set autoindent ; set tabstop=4 (vim)
5
+ * $Id$
6
+ */
7
+
8
+ #ifdef HAVE_CONFIG_H
9
+ # include "config.h"
10
+ #endif
11
+
12
+ #include "xtree.h"
13
+ #include "xdb.h"
14
+ #include <stdio.h>
15
+ #include <stdlib.h>
16
+ #ifndef WIN32
17
+ # include <unistd.h>
18
+ #endif
19
+ #include <string.h>
20
+
21
+ /* private static functions */
22
+ static int _xtree_hasher(xtree_t xt, const char *s, int len)
23
+ {
24
+ unsigned int h = xt->base;
25
+ while (len--)
26
+ {
27
+ h += (h<<5);
28
+ h ^= (unsigned char) s[len];
29
+ h &= 0x7fffffff;
30
+ }
31
+ return (h % xt->prime);
32
+ }
33
+
34
+ static node_t _xtree_node_search(node_t head, node_t **pnode, const char *key, int len)
35
+ {
36
+ int cmp;
37
+
38
+ cmp = memcmp(key, head->key, len);
39
+ if (cmp == 0)
40
+ cmp = len - strlen(head->key);
41
+
42
+ if (cmp != 0)
43
+ {
44
+ node_t *next;
45
+
46
+ next = (cmp > 0 ? &head->right : &head->left);
47
+ if (*next == NULL)
48
+ {
49
+ if (pnode != NULL)
50
+ *pnode = next;
51
+ return NULL;
52
+ }
53
+ return _xtree_node_search(*next, pnode, key, len);
54
+ }
55
+ return head;
56
+ }
57
+
58
+ static node_t _xtree_node_find(xtree_t xt, node_t **pnode, const char *key, int len)
59
+ {
60
+ int i;
61
+ i = (xt->prime > 1 ? _xtree_hasher(xt, key, len) : 0);
62
+ if (xt->trees[i] == NULL)
63
+ {
64
+ if (pnode != NULL)
65
+ *pnode = &xt->trees[i];
66
+ return NULL;
67
+ }
68
+ return _xtree_node_search(xt->trees[i], pnode, key, len);
69
+ }
70
+
71
+ /* public functions */
72
+ xtree_t xtree_new(int base, int prime)
73
+ {
74
+ xtree_t xnew;
75
+ pool_t p;
76
+
77
+ p = pool_new();
78
+ xnew = pmalloc(p, sizeof(xtree_st));
79
+ xnew->p = p;
80
+ xnew->base = (base ? base : 0xf422f);
81
+ xnew->prime = (prime ? prime : 31);
82
+ xnew->count = 0;
83
+ xnew->trees = (node_t *) pmalloc_z(p, sizeof(node_t) * xnew->prime);
84
+ return xnew;
85
+ }
86
+
87
+ void xtree_free(xtree_t xt)
88
+ {
89
+ if (xt)
90
+ pool_free(xt->p);
91
+ }
92
+
93
+ void xtree_nput(xtree_t xt, void *value, int vlen, const char *key, int len)
94
+ {
95
+ node_t node, *pnode;
96
+
97
+ if (xt == NULL || key == NULL || len == 0)
98
+ return;
99
+
100
+ if ((node = _xtree_node_find(xt, &pnode, key, len)) != NULL)
101
+ {
102
+ node->value = value;
103
+ node->vlen = vlen;
104
+ return;
105
+ }
106
+
107
+ if (value != NULL)
108
+ {
109
+ *pnode = node = (node_t) pmalloc(xt->p, sizeof(node_st));
110
+ node->key = pstrndup(xt->p, key, len);
111
+ node->value = value;
112
+ node->vlen = vlen;
113
+ node->left = NULL;
114
+ node->right = NULL;
115
+ }
116
+ }
117
+
118
+ void xtree_put(xtree_t xt, const char *value, const char *key)
119
+ {
120
+ if (xt != NULL && key != NULL)
121
+ xtree_nput(xt, (void *) value, value ? strlen(value) : 0, key, strlen(key));
122
+ }
123
+
124
+ void *xtree_nget(xtree_t xt, const char *key, int len, int *vlen)
125
+ {
126
+ node_t node;
127
+
128
+ if (xt == NULL || key == NULL || len == 0
129
+ || !(node = _xtree_node_find(xt, NULL, key, len)))
130
+ {
131
+ return NULL;
132
+ }
133
+
134
+ if (vlen != NULL)
135
+ *vlen = node->vlen;
136
+ return node->value;
137
+ }
138
+
139
+ void *xtree_get(xtree_t xt, const char *key, int *vlen)
140
+ {
141
+ if (xt == NULL || key == NULL)
142
+ return NULL;
143
+
144
+ return xtree_nget(xt, key, strlen(key), vlen);
145
+ }
146
+
147
+ /*
148
+ void xtree_ndel(xtree_t xt, const char *key, int len)
149
+ {
150
+ xtree_nput(xt, NULL, 0, key, len);
151
+ }
152
+
153
+ void xtree_del(xtree_t xt, const char *key)
154
+ {
155
+ if (xt == NULL || key == NULL)
156
+ return;
157
+
158
+ xtree_ndel(xt, key, strlen(key));
159
+ }
160
+ */
161
+
162
+ #ifdef DEBUG
163
+ /* draw the xtree to stdout */
164
+ struct draw_arg
165
+ {
166
+ int depth;
167
+ int count;
168
+ int flag;
169
+ };
170
+
171
+ static void _xtree_draw_node(node_t node, struct draw_arg *arg, int depth, char *icon1)
172
+ {
173
+ char *icon2;
174
+
175
+ icon2 = malloc(strlen(icon1) + 4);
176
+ strcpy(icon2, icon1);
177
+
178
+ // output the flag & icon
179
+ if (arg->flag == 'T')
180
+ printf("(T) ");
181
+ else
182
+ {
183
+ printf("%s", icon2);
184
+ if (arg->flag == 'L')
185
+ {
186
+ strcat(icon2, " ┃");
187
+ printf(" ┟(L) ");
188
+ }
189
+ else
190
+ {
191
+ strcat(icon2, "  ");
192
+ printf(" └(R) ");
193
+ }
194
+ }
195
+
196
+ // draw the node data
197
+ if (node == NULL)
198
+ printf("<NULL>\n");
199
+ else
200
+ {
201
+ printf("%s (value on 0x%x vlen=%d)\n", node->key, (unsigned int)node->value, node->vlen);
202
+
203
+ arg->count++;
204
+ depth++;
205
+ if (depth > arg->depth)
206
+ arg->depth = depth;
207
+
208
+ // draw the left & right
209
+ arg->flag = 'L';
210
+ _xtree_draw_node(node->left, arg, depth, icon2);
211
+
212
+ arg->flag = 'R';
213
+ _xtree_draw_node(node->right, arg, depth, icon2);
214
+ }
215
+ free(icon2);
216
+ }
217
+
218
+ void xtree_draw(xtree_t xt)
219
+ {
220
+ int i;
221
+ struct draw_arg arg;
222
+
223
+ if (!xt)
224
+ return;
225
+
226
+ for (i = 0; i < xt->prime; i++)
227
+ {
228
+ arg.depth = 0;
229
+ arg.count = 0;
230
+ arg.flag = 'T';
231
+ _xtree_draw_node(xt->trees[i], &arg, 0, "");
232
+ printf("-----------------------------------------\n");
233
+ printf("Tree [%d] max_depth: %d nodes_num: %d\n", i, arg.depth, arg.count);
234
+ }
235
+ }
236
+ #endif
237
+
238
+ /* optimize the tree */
239
+ static void _xtree_count_nodes(node_t node, int *count)
240
+ {
241
+ if (node == NULL)
242
+ return;
243
+
244
+ *count += 1;
245
+ _xtree_count_nodes(node->left, count);
246
+ _xtree_count_nodes(node->right, count);
247
+ }
248
+
249
+ static void _xtree_load_nodes(node_t node, node_t *nodes, int *count)
250
+ {
251
+ int i = *count;
252
+ if (node == NULL)
253
+ return;
254
+
255
+ nodes[i] = node;
256
+ *count = ++i;
257
+ _xtree_load_nodes(node->left, nodes, count);
258
+ _xtree_load_nodes(node->right, nodes, count);
259
+ }
260
+
261
+ static void _xtree_reset_nodes(node_t *nodes, int low, int high, node_t *curr)
262
+ {
263
+ if (low <= high)
264
+ {
265
+ int mid = (low + high)>>1;
266
+
267
+ *curr = nodes[mid];
268
+ _xtree_reset_nodes(nodes, low, mid-1, &(*curr)->left);
269
+ _xtree_reset_nodes(nodes, mid + 1, high, &(*curr)->right);
270
+ }
271
+ else
272
+ {
273
+ *curr = NULL;
274
+ }
275
+ }
276
+
277
+ #ifdef WIN32
278
+ static int _xtree_node_cmp(node_t *a, node_t *b)
279
+ #else
280
+ static int _xtree_node_cmp(a, b)
281
+ node_t *a, *b;
282
+ #endif
283
+ {
284
+ return strcmp((*a)->key, (*b)->key);
285
+ }
286
+
287
+ void xtree_optimize(xtree_t xt)
288
+ {
289
+ int i, cnt;
290
+ node_t *nodes;
291
+
292
+ if (!xt)
293
+ return;
294
+
295
+ for (i = 0; i < xt->prime; i++)
296
+ {
297
+ cnt = 0;
298
+ _xtree_count_nodes(xt->trees[i], &cnt);
299
+ if (cnt > 2)
300
+ {
301
+ nodes = (node_t *)malloc(sizeof(node_t) * cnt);
302
+ cnt = 0;
303
+ _xtree_load_nodes(xt->trees[i], nodes, &cnt);
304
+ qsort(nodes, cnt, sizeof(node_t), _xtree_node_cmp);
305
+ _xtree_reset_nodes(nodes, 0, cnt - 1, &xt->trees[i]);
306
+ free(nodes);
307
+ }
308
+ }
309
+ }
310
+
311
+ /* convert xtree to xdb file */
312
+ static void _xtree_to_xdb_node(node_t node, xdb_t x)
313
+ {
314
+ if (node == NULL)
315
+ return;
316
+
317
+ xdb_nput(x, node->value, node->vlen, node->key, strlen(node->key));
318
+ _xtree_to_xdb_node(node->left, x);
319
+ _xtree_to_xdb_node(node->right, x);
320
+ }
321
+
322
+ void xtree_to_xdb(xtree_t xt, const char *fpath)
323
+ {
324
+ xdb_t x;
325
+ int i;
326
+
327
+ if (!xt || !(x = xdb_create(fpath, xt->base, xt->prime)))
328
+ return;
329
+
330
+ for (i = 0; i < xt->prime; i++)
331
+ {
332
+ _xtree_to_xdb_node(xt->trees[i], x);
333
+ }
334
+
335
+ xdb_close(x);
336
+ }
337
+
@@ -0,0 +1,65 @@
1
+ /**
2
+ * @file xtree.h
3
+ * @author Hightman Mar
4
+ * @editor set number ; syntax on ; set autoindent ; set tabstop=4 (vim)
5
+ * $Id$
6
+ */
7
+
8
+ #ifndef _SCWS_XTREE_20070525_H_
9
+ #define _SCWS_XTREE_20070525_H_
10
+
11
+ #ifdef __cplusplus
12
+ extern "C" {
13
+ #endif
14
+
15
+ /* pool required */
16
+ #include "pool.h"
17
+
18
+ /* data structure for Hash+Tree */
19
+ typedef struct tree_node node_st, *node_t;
20
+ struct tree_node
21
+ {
22
+ char *key; /* NUL-terminated key; xtree_nput stores a pstrndup() copy */
23
+ void *value; /* caller-supplied pointer, stored as-is by xtree_nput */
24
+ int vlen; /* length passed in together with value */
25
+ node_t left; /* subtree searched when the lookup key compares lower */
26
+ node_t right; /* subtree searched when the lookup key compares higher */
27
+ };
28
+
29
+ typedef struct
30
+ {
31
+ pool_t p; /* pool for memory manager */
32
+ int base; /* base number for hasher (prime number recommended) */
33
+ int prime; /* good prime number for hasher; also the number of buckets */
34
+ int count; /* total nodes */
35
+ node_t *trees; /* bucket roots [total=prime, as allocated by xtree_new] */
36
+ } xtree_st, *xtree_t;
37
+
38
+ /* xtree: api */
39
+ int xtree_hasher(xtree_t xt, const char *key, int len);
40
+ xtree_t xtree_new(int base, int prime); /* create a new hash xtree */
41
+ void xtree_free(xtree_t xt); /* delete & free the xtree */
42
+
43
+ void xtree_put(xtree_t xt, const char *value, const char *key);
44
+ void xtree_nput(xtree_t xt, void *value, int vlen, const char *key, int len);
45
+
46
+ void *xtree_get(xtree_t xt, const char *key, int *vlen);
47
+ void *xtree_nget(xtree_t xt, const char *key, int len, int *vlen);
48
+
49
+ /*
50
+ void xtree_del(xtree_t xt, const char *key);
51
+ void xtree_ndel(xtree_t xt, const char *key, int len);
52
+ */
53
+
54
+ #ifdef DEBUG
55
+ void xtree_draw(xtree_t xt);
56
+ #endif
57
+
58
+ void xtree_optimize(xtree_t xt);
59
+ void xtree_to_xdb(xtree_t xt, const char *fpath);
60
+
61
+ #ifdef __cplusplus
62
+ }
63
+ #endif
64
+
65
+ #endif
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
+ class Scws4r
4
+ VERSION = "0.1.0" # gem version string, read by scws4r.gemspec
5
+ end
data/lib/scws4r.rb ADDED
@@ -0,0 +1,15 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "scws4r/version"
4
+ require 'scws4r/scws4r'
5
+
6
class Scws4r
  # Configure this instance with the dictionary and rule file shipped in the
  # gem's defaults/ directory (one level above lib/).
  def load_defaults
    set_dic File.expand_path('../defaults/dict.utf8.xdb', __dir__), Scws4r::XDICT_XDB | Scws4r::XDICT_MEM
    # The packaged rule file is rules.utf8.ini (not ".int").
    set_rule File.expand_path('../defaults/rules.utf8.ini', __dir__)
  end

  # Segment +text+; currently a plain alias for #split.
  def split_to_tsvector(text)
    split(text)
  end
end
data/scws4r.gemspec ADDED
@@ -0,0 +1,30 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "lib/scws4r/version"
4
+
5
Gem::Specification.new do |spec|
  # Identity
  spec.name = "scws4r"
  spec.version = Scws4r::VERSION
  spec.authors = ["xiaohui"]
  spec.email = ["xiaohui@tanmer.com"]

  # Description
  spec.summary = "Integrate (SCWS) Simple Chinese Word Segmentation C lib"
  spec.description = "With SCWS to split Chinese sentences"
  spec.homepage = "https://github.com/xiaohui-zhangxh/scws"
  spec.license = "MIT"
  spec.required_ruby_version = ">= 2.5.0"

  spec.metadata["homepage_uri"] = spec.homepage
  spec.metadata["source_code_uri"] = "https://github.com/xiaohui-zhangxh/scws"
  # spec.metadata["changelog_uri"] = ""

  # Package every git-tracked file except this gemspec itself and paths
  # matching the exclusion pattern below.
  excluded = %r{\A(?:(?:bin|test|spec|features)/|\.(?:git|travis|circleci)|appveyor)}
  spec.files = Dir.chdir(File.expand_path(__dir__)) do
    tracked = `git ls-files -z`.split("\x0")
    tracked.reject { |path| (path == __FILE__) || path.match(excluded) }
  end

  spec.extensions = ["ext/scws4r/extconf.rb"]
  spec.require_paths = ["lib"]
end
data/sig/scws.rbs ADDED
@@ -0,0 +1,4 @@
1
# Signatures for the Scws4r class defined in lib/scws4r.rb and
# lib/scws4r/version.rb (the gem defines a class named Scws4r, not a module
# named Scws).
class Scws4r
  VERSION: String
  # See the writing guide of rbs: https://github.com/ruby/rbs#guides

  def load_defaults: () -> untyped
  def split_to_tsvector: (untyped text) -> untyped
end
data/test.rb ADDED
@@ -0,0 +1,16 @@
1
require 'scws4r'

# Text to segment: first command-line argument, or a built-in sample.
text = ARGV[0] || 'Hello, 我名字叫李那曲是一个中国人, 我有时买Q币来玩, 我还听说过C#语言'

s = Scws4r.new
s.charset='utf8'
# Use the dictionary packaged under defaults/ next to this script instead of
# an absolute path that exists only on one developer's machine.
s.add_dic File.expand_path('defaults/dict.utf8.xdb', __dir__), Scws4r::XDICT_XDB
a=s.split(text)
puts a
puts '------------'
# NOTE(review): the first assignment is immediately replaced by the second.
s.multi = Scws4r::MULTI_SHORT | Scws4r::MULTI_DUALITY
s.multi = Scws4r::MULTI_DUALITY
s.punctuation_ignore = true
# s.duality = true
a=s.split(text)
puts a
puts '------------'
a=s.tops(text, 10)
puts a
metadata ADDED
@@ -0,0 +1,88 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: scws4r
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - xiaohui
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2022-07-26 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: With SCWS to split Chinese sentences
14
+ email:
15
+ - xiaohui@tanmer.com
16
+ executables: []
17
+ extensions:
18
+ - ext/scws4r/extconf.rb
19
+ extra_rdoc_files: []
20
+ files:
21
+ - ".rspec"
22
+ - ".rubocop.yml"
23
+ - CHANGELOG.md
24
+ - Gemfile
25
+ - Gemfile.lock
26
+ - LICENSE.txt
27
+ - README.md
28
+ - Rakefile
29
+ - defaults/dict.utf8.xdb
30
+ - defaults/rules.utf8.ini
31
+ - ext/scws4r/Makefile
32
+ - ext/scws4r/Makefile.am
33
+ - ext/scws4r/charset.c
34
+ - ext/scws4r/charset.h
35
+ - ext/scws4r/config_win32.h
36
+ - ext/scws4r/crc32.c
37
+ - ext/scws4r/crc32.h
38
+ - ext/scws4r/darray.c
39
+ - ext/scws4r/darray.h
40
+ - ext/scws4r/extconf.rb
41
+ - ext/scws4r/lock.c
42
+ - ext/scws4r/lock.h
43
+ - ext/scws4r/pool.c
44
+ - ext/scws4r/pool.h
45
+ - ext/scws4r/rule.c
46
+ - ext/scws4r/rule.h
47
+ - ext/scws4r/scws.c
48
+ - ext/scws4r/scws.h
49
+ - ext/scws4r/scws4r.c
50
+ - ext/scws4r/scws4r.h
51
+ - ext/scws4r/version.h.in
52
+ - ext/scws4r/xdb.c
53
+ - ext/scws4r/xdb.h
54
+ - ext/scws4r/xdict.c
55
+ - ext/scws4r/xdict.h
56
+ - ext/scws4r/xtree.c
57
+ - ext/scws4r/xtree.h
58
+ - lib/scws4r.rb
59
+ - lib/scws4r/version.rb
60
+ - scws4r.gemspec
61
+ - sig/scws.rbs
62
+ - test.rb
63
+ homepage: https://github.com/xiaohui-zhangxh/scws
64
+ licenses:
65
+ - MIT
66
+ metadata:
67
+ homepage_uri: https://github.com/xiaohui-zhangxh/scws
68
+ source_code_uri: https://github.com/xiaohui-zhangxh/scws
69
+ post_install_message:
70
+ rdoc_options: []
71
+ require_paths:
72
+ - lib
73
+ required_ruby_version: !ruby/object:Gem::Requirement
74
+ requirements:
75
+ - - ">="
76
+ - !ruby/object:Gem::Version
77
+ version: 2.5.0
78
+ required_rubygems_version: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ requirements: []
84
+ rubygems_version: 3.3.7
85
+ signing_key:
86
+ specification_version: 4
87
+ summary: Integrate (SCWS) Simple Chinese Word Segmentation C lib
88
+ test_files: []