scws4r 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,337 @@
1
+ /**
2
+ * @file xtree.c
3
+ * @author Hightman Mar
4
+ * @editor set number ; syntax on ; set autoindent ; set tabstop=4 (vim)
5
+ * $Id$
6
+ */
7
+
8
+ #ifdef HAVE_CONFIG_H
9
+ # include "config.h"
10
+ #endif
11
+
12
+ #include "xtree.h"
13
+ #include "xdb.h"
14
+ #include <stdio.h>
15
+ #include <stdlib.h>
16
+ #ifndef WIN32
17
+ # include <unistd.h>
18
+ #endif
19
+ #include <string.h>
20
+
21
+ /* private static functions */
22
+ static int _xtree_hasher(xtree_t xt, const char *s, int len)
23
+ {
24
+ unsigned int h = xt->base;
25
+ while (len--)
26
+ {
27
+ h += (h<<5);
28
+ h ^= (unsigned char) s[len];
29
+ h &= 0x7fffffff;
30
+ }
31
+ return (h % xt->prime);
32
+ }
33
+
34
+ static node_t _xtree_node_search(node_t head, node_t **pnode, const char *key, int len)
35
+ {
36
+ int cmp;
37
+
38
+ cmp = memcmp(key, head->key, len);
39
+ if (cmp == 0)
40
+ cmp = len - strlen(head->key);
41
+
42
+ if (cmp != 0)
43
+ {
44
+ node_t *next;
45
+
46
+ next = (cmp > 0 ? &head->right : &head->left);
47
+ if (*next == NULL)
48
+ {
49
+ if (pnode != NULL)
50
+ *pnode = next;
51
+ return NULL;
52
+ }
53
+ return _xtree_node_search(*next, pnode, key, len);
54
+ }
55
+ return head;
56
+ }
57
+
58
+ static node_t _xtree_node_find(xtree_t xt, node_t **pnode, const char *key, int len)
59
+ {
60
+ int i;
61
+ i = (xt->prime > 1 ? _xtree_hasher(xt, key, len) : 0);
62
+ if (xt->trees[i] == NULL)
63
+ {
64
+ if (pnode != NULL)
65
+ *pnode = &xt->trees[i];
66
+ return NULL;
67
+ }
68
+ return _xtree_node_search(xt->trees[i], pnode, key, len);
69
+ }
70
+
71
+ /* public functions */
72
+ xtree_t xtree_new(int base, int prime)
73
+ {
74
+ xtree_t xnew;
75
+ pool_t p;
76
+
77
+ p = pool_new();
78
+ xnew = pmalloc(p, sizeof(xtree_st));
79
+ xnew->p = p;
80
+ xnew->base = (base ? base : 0xf422f);
81
+ xnew->prime = (prime ? prime : 31);
82
+ xnew->count = 0;
83
+ xnew->trees = (node_t *) pmalloc_z(p, sizeof(node_t) * xnew->prime);
84
+ return xnew;
85
+ }
86
+
87
+ void xtree_free(xtree_t xt)
88
+ {
89
+ if (xt)
90
+ pool_free(xt->p);
91
+ }
92
+
93
+ void xtree_nput(xtree_t xt, void *value, int vlen, const char *key, int len)
94
+ {
95
+ node_t node, *pnode;
96
+
97
+ if (xt == NULL || key == NULL || len == 0)
98
+ return;
99
+
100
+ if ((node = _xtree_node_find(xt, &pnode, key, len)) != NULL)
101
+ {
102
+ node->value = value;
103
+ node->vlen = vlen;
104
+ return;
105
+ }
106
+
107
+ if (value != NULL)
108
+ {
109
+ *pnode = node = (node_t) pmalloc(xt->p, sizeof(node_st));
110
+ node->key = pstrndup(xt->p, key, len);
111
+ node->value = value;
112
+ node->vlen = vlen;
113
+ node->left = NULL;
114
+ node->right = NULL;
115
+ }
116
+ }
117
+
118
+ void xtree_put(xtree_t xt, const char *value, const char *key)
119
+ {
120
+ if (xt != NULL && key != NULL)
121
+ xtree_nput(xt, (void *) value, value ? strlen(value) : 0, key, strlen(key));
122
+ }
123
+
124
+ void *xtree_nget(xtree_t xt, const char *key, int len, int *vlen)
125
+ {
126
+ node_t node;
127
+
128
+ if (xt == NULL || key == NULL || len == 0
129
+ || !(node = _xtree_node_find(xt, NULL, key, len)))
130
+ {
131
+ return NULL;
132
+ }
133
+
134
+ if (vlen != NULL)
135
+ *vlen = node->vlen;
136
+ return node->value;
137
+ }
138
+
139
+ void *xtree_get(xtree_t xt, const char *key, int *vlen)
140
+ {
141
+ if (xt == NULL || key == NULL)
142
+ return NULL;
143
+
144
+ return xtree_nget(xt, key, strlen(key), vlen);
145
+ }
146
+
147
+ /*
148
+ void xtree_ndel(xtree_t xt, const char *key, int len)
149
+ {
150
+ xtree_nput(xt, NULL, 0, key, len);
151
+ }
152
+
153
+ void xtree_del(xtree_t xt, const char *key)
154
+ {
155
+ if (xt == NULL || key == NULL)
156
+ return;
157
+
158
+ xtree_ndel(xt, key, strlen(key));
159
+ }
160
+ */
161
+
162
+ #ifdef DEBUG
163
+ /* draw the xtree to stdout */
164
+ struct draw_arg
165
+ {
166
+ int depth;
167
+ int count;
168
+ int flag;
169
+ };
170
+
171
+ static void _xtree_draw_node(node_t node, struct draw_arg *arg, int depth, char *icon1)
172
+ {
173
+ char *icon2;
174
+
175
+ icon2 = malloc(strlen(icon1) + 4);
176
+ strcpy(icon2, icon1);
177
+
178
+ // output the flag & icon
179
+ if (arg->flag == 'T')
180
+ printf("(T) ");
181
+ else
182
+ {
183
+ printf("%s", icon2);
184
+ if (arg->flag == 'L')
185
+ {
186
+ strcat(icon2, " ┃");
187
+ printf(" ┟(L) ");
188
+ }
189
+ else
190
+ {
191
+ strcat(icon2, "  ");
192
+ printf(" └(R) ");
193
+ }
194
+ }
195
+
196
+ // draw the node data
197
+ if (node == NULL)
198
+ printf("<NULL>\n");
199
+ else
200
+ {
201
+ printf("%s (value on 0x%x vlen=%d)\n", node->key, (unsigned int)node->value, node->vlen);
202
+
203
+ arg->count++;
204
+ depth++;
205
+ if (depth > arg->depth)
206
+ arg->depth = depth;
207
+
208
+ // draw the left & right
209
+ arg->flag = 'L';
210
+ _xtree_draw_node(node->left, arg, depth, icon2);
211
+
212
+ arg->flag = 'R';
213
+ _xtree_draw_node(node->right, arg, depth, icon2);
214
+ }
215
+ free(icon2);
216
+ }
217
+
218
+ void xtree_draw(xtree_t xt)
219
+ {
220
+ int i;
221
+ struct draw_arg arg;
222
+
223
+ if (!xt)
224
+ return;
225
+
226
+ for (i = 0; i < xt->prime; i++)
227
+ {
228
+ arg.depth = 0;
229
+ arg.count = 0;
230
+ arg.flag = 'T';
231
+ _xtree_draw_node(xt->trees[i], &arg, 0, "");
232
+ printf("-----------------------------------------\n");
233
+ printf("Tree [%d] max_depth: %d nodes_num: %d\n", i, arg.depth, arg.count);
234
+ }
235
+ }
236
+ #endif
237
+
238
+ /* optimize the tree */
239
+ static void _xtree_count_nodes(node_t node, int *count)
240
+ {
241
+ if (node == NULL)
242
+ return;
243
+
244
+ *count += 1;
245
+ _xtree_count_nodes(node->left, count);
246
+ _xtree_count_nodes(node->right, count);
247
+ }
248
+
249
+ static void _xtree_load_nodes(node_t node, node_t *nodes, int *count)
250
+ {
251
+ int i = *count;
252
+ if (node == NULL)
253
+ return;
254
+
255
+ nodes[i] = node;
256
+ *count = ++i;
257
+ _xtree_load_nodes(node->left, nodes, count);
258
+ _xtree_load_nodes(node->right, nodes, count);
259
+ }
260
+
261
+ static void _xtree_reset_nodes(node_t *nodes, int low, int high, node_t *curr)
262
+ {
263
+ if (low <= high)
264
+ {
265
+ int mid = (low + high)>>1;
266
+
267
+ *curr = nodes[mid];
268
+ _xtree_reset_nodes(nodes, low, mid-1, &(*curr)->left);
269
+ _xtree_reset_nodes(nodes, mid + 1, high, &(*curr)->right);
270
+ }
271
+ else
272
+ {
273
+ *curr = NULL;
274
+ }
275
+ }
276
+
277
+ #ifdef WIN32
278
+ static int _xtree_node_cmp(node_t *a, node_t *b)
279
+ #else
280
+ static int _xtree_node_cmp(a, b)
281
+ node_t *a, *b;
282
+ #endif
283
+ {
284
+ return strcmp((*a)->key, (*b)->key);
285
+ }
286
+
287
+ void xtree_optimize(xtree_t xt)
288
+ {
289
+ int i, cnt;
290
+ node_t *nodes;
291
+
292
+ if (!xt)
293
+ return;
294
+
295
+ for (i = 0; i < xt->prime; i++)
296
+ {
297
+ cnt = 0;
298
+ _xtree_count_nodes(xt->trees[i], &cnt);
299
+ if (cnt > 2)
300
+ {
301
+ nodes = (node_t *)malloc(sizeof(node_t) * cnt);
302
+ cnt = 0;
303
+ _xtree_load_nodes(xt->trees[i], nodes, &cnt);
304
+ qsort(nodes, cnt, sizeof(node_t), _xtree_node_cmp);
305
+ _xtree_reset_nodes(nodes, 0, cnt - 1, &xt->trees[i]);
306
+ free(nodes);
307
+ }
308
+ }
309
+ }
310
+
311
+ /* convert xtree to xdb file */
312
+ static void _xtree_to_xdb_node(node_t node, xdb_t x)
313
+ {
314
+ if (node == NULL)
315
+ return;
316
+
317
+ xdb_nput(x, node->value, node->vlen, node->key, strlen(node->key));
318
+ _xtree_to_xdb_node(node->left, x);
319
+ _xtree_to_xdb_node(node->right, x);
320
+ }
321
+
322
+ void xtree_to_xdb(xtree_t xt, const char *fpath)
323
+ {
324
+ xdb_t x;
325
+ int i;
326
+
327
+ if (!xt || !(x = xdb_create(fpath, xt->base, xt->prime)))
328
+ return;
329
+
330
+ for (i = 0; i < xt->prime; i++)
331
+ {
332
+ _xtree_to_xdb_node(xt->trees[i], x);
333
+ }
334
+
335
+ xdb_close(x);
336
+ }
337
+
@@ -0,0 +1,65 @@
1
+ /**
2
+ * @file xtree.h
3
+ * @author Hightman Mar
4
+ * @editor set number ; syntax on ; set autoindent ; set tabstop=4 (vim)
5
+ * $Id$
6
+ */
7
+
8
+ #ifndef _SCWS_XTREE_20070525_H_
9
+ #define _SCWS_XTREE_20070525_H_
10
+
11
+ #ifdef __cplusplus
12
+ extern "C" {
13
+ #endif
14
+
15
+ /* pool required */
16
+ #include "pool.h"
17
+
18
+ /* data structure for Hash+Tree */
19
/* one node of a bucket's binary search tree */
struct tree_node
{
	char *key;					/* NUL-terminated key the node is found by */
	void *value;				/* value pointer supplied by the caller */
	int vlen;					/* length recorded alongside the value */
	struct tree_node *left;		/* subtree of smaller keys */
	struct tree_node *right;	/* subtree of larger keys */
};
typedef struct tree_node node_st;
typedef struct tree_node *node_t;
28
+
29
+ typedef struct
30
+ {
31
+ pool_t p; /* pool for memory manager */
32
+ int base; /* base number for hasher (prime number recommend) */
33
+ int prime; /* good prime number for hasher */
34
+ int count; /* total nodes */
35
+ node_t *trees; /* trees [total=prime+1] */
36
+ } xtree_st, *xtree_t;
37
+
38
+ /* xtree: api */
39
+ int xtree_hasher(xtree_t xt, const char *key, int len); /* NOTE(review): xtree.c only defines the static _xtree_hasher(); confirm a public definition exists */
40
+ xtree_t xtree_new(int base, int prime); /* create a new xtree; 0 for base or prime selects the default (0xf422f / 31) */
41
+ void xtree_free(xtree_t xt); /* free the xtree by releasing its pool; NULL is ignored */
42
+
43
+ void xtree_put(xtree_t xt, const char *value, const char *key); /* strlen()-based wrapper around xtree_nput() */
44
+ void xtree_nput(xtree_t xt, void *value, int vlen, const char *key, int len); /* overwrite an existing key; add a new key only when value != NULL */
45
+
46
+ void *xtree_get(xtree_t xt, const char *key, int *vlen); /* strlen()-based wrapper around xtree_nget() */
47
+ void *xtree_nget(xtree_t xt, const char *key, int len, int *vlen); /* stored value or NULL; *vlen is set on a hit when vlen != NULL */
48
+
49
+ /*
50
+ void xtree_del(xtree_t xt, const char *key);
51
+ void xtree_ndel(xtree_t xt, const char *key, int len);
52
+ */
53
+
54
+ #ifdef DEBUG
55
+ void xtree_draw(xtree_t xt); /* print every bucket's tree to stdout */
56
+ #endif
57
+
58
+ void xtree_optimize(xtree_t xt); /* rebalance every bucket holding more than 2 nodes */
59
+ void xtree_to_xdb(xtree_t xt, const char *fpath); /* pass every node to the xdb created for fpath */
60
+
61
+ #ifdef __cplusplus
62
+ }
63
+ #endif
64
+
65
+ #endif
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
class Scws4r
  # Version of the scws4r gem; the gemspec reads it as Scws4r::VERSION.
  VERSION = '0.1.0'
end
data/lib/scws4r.rb ADDED
@@ -0,0 +1,15 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "scws4r/version"
4
+ require 'scws4r/scws4r'
5
+
6
class Scws4r
  # Loads the dictionary and the rule set shipped in the gem's defaults/
  # directory. The dictionary is registered with XDICT_XDB | XDICT_MEM.
  def load_defaults
    set_dic File.expand_path('../defaults/dict.utf8.xdb', __dir__), Scws4r::XDICT_XDB | Scws4r::XDICT_MEM
    # The packaged rule file is defaults/rules.utf8.ini; the ".int" extension
    # requested before does not match any file in the gem.
    set_rule File.expand_path('../defaults/rules.utf8.ini', __dir__)
  end

  # Returns the result of #split for +text+.
  def split_to_tsvector(text)
    split(text)
  end
end
data/scws4r.gemspec ADDED
@@ -0,0 +1,30 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "lib/scws4r/version"
4
+
5
Gem::Specification.new do |spec|
  repo_url = "https://github.com/xiaohui-zhangxh/scws"
  # Paths matching this pattern are left out of the packaged gem.
  excluded = %r{\A(?:(?:bin|test|spec|features)/|\.(?:git|travis|circleci)|appveyor)}

  spec.name = "scws4r"
  spec.version = Scws4r::VERSION
  spec.authors = ["xiaohui"]
  spec.email = ["xiaohui@tanmer.com"]

  spec.summary = "Integrate (SCWS) Simple Chinese Word Segmentation C lib"
  spec.description = "With SCWS to split Chinese sentences"
  spec.homepage = repo_url
  spec.license = "MIT"
  spec.required_ruby_version = ">= 2.5.0"

  spec.metadata["homepage_uri"] = spec.homepage
  spec.metadata["source_code_uri"] = repo_url
  # spec.metadata["changelog_uri"] = ""

  # Package every file tracked by git, except this gemspec and the excluded
  # paths above. `git ls-files -z` separates the names with NUL bytes.
  spec.files = Dir.chdir(File.expand_path(__dir__)) do
    tracked = `git ls-files -z`.split("\x0")
    tracked.reject { |path| (path == __FILE__) || path.match(excluded) }
  end
  spec.extensions = %w[ext/scws4r/extconf.rb]
  spec.require_paths = ["lib"]
end
data/sig/scws.rbs ADDED
@@ -0,0 +1,4 @@
1
# The gem defines `class Scws4r`, so the signature declares that class rather
# than a `Scws` module.
class Scws4r
  VERSION: String
  # See the writing guide of rbs: https://github.com/ruby/rbs#guides
end
data/test.rb ADDED
@@ -0,0 +1,16 @@
1
require 'scws4r'

# Sentence segmented when no text is given on the command line.
SAMPLE_TEXT = 'Hello, 我名字叫李那曲是一个中国人, 我有时买Q币来玩, 我还听说过C#语言'
text = ARGV[0] || SAMPLE_TEXT

s = Scws4r.new
s.charset = 'utf8'
# Use the dictionary packaged under defaults/ instead of an absolute path that
# only exists on the author's machine.
s.add_dic File.expand_path('defaults/dict.utf8.xdb', __dir__), Scws4r::XDICT_XDB
a = s.split(text)
puts a
puts '------------'
# NOTE(review): presumably replaced by the assignment on the next line.
s.multi = Scws4r::MULTI_SHORT | Scws4r::MULTI_DUALITY
s.multi = Scws4r::MULTI_DUALITY
s.punctuation_ignore = true
# s.duality = true
a = s.split(text)
puts a
puts '------------'
a = s.tops(text, 10)
puts a
metadata ADDED
@@ -0,0 +1,88 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: scws4r
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - xiaohui
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2022-07-26 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: With SCWS to split Chinese sentences
14
+ email:
15
+ - xiaohui@tanmer.com
16
+ executables: []
17
+ extensions:
18
+ - ext/scws4r/extconf.rb
19
+ extra_rdoc_files: []
20
+ files:
21
+ - ".rspec"
22
+ - ".rubocop.yml"
23
+ - CHANGELOG.md
24
+ - Gemfile
25
+ - Gemfile.lock
26
+ - LICENSE.txt
27
+ - README.md
28
+ - Rakefile
29
+ - defaults/dict.utf8.xdb
30
+ - defaults/rules.utf8.ini
31
+ - ext/scws4r/Makefile
32
+ - ext/scws4r/Makefile.am
33
+ - ext/scws4r/charset.c
34
+ - ext/scws4r/charset.h
35
+ - ext/scws4r/config_win32.h
36
+ - ext/scws4r/crc32.c
37
+ - ext/scws4r/crc32.h
38
+ - ext/scws4r/darray.c
39
+ - ext/scws4r/darray.h
40
+ - ext/scws4r/extconf.rb
41
+ - ext/scws4r/lock.c
42
+ - ext/scws4r/lock.h
43
+ - ext/scws4r/pool.c
44
+ - ext/scws4r/pool.h
45
+ - ext/scws4r/rule.c
46
+ - ext/scws4r/rule.h
47
+ - ext/scws4r/scws.c
48
+ - ext/scws4r/scws.h
49
+ - ext/scws4r/scws4r.c
50
+ - ext/scws4r/scws4r.h
51
+ - ext/scws4r/version.h.in
52
+ - ext/scws4r/xdb.c
53
+ - ext/scws4r/xdb.h
54
+ - ext/scws4r/xdict.c
55
+ - ext/scws4r/xdict.h
56
+ - ext/scws4r/xtree.c
57
+ - ext/scws4r/xtree.h
58
+ - lib/scws4r.rb
59
+ - lib/scws4r/version.rb
60
+ - scws4r.gemspec
61
+ - sig/scws.rbs
62
+ - test.rb
63
+ homepage: https://github.com/xiaohui-zhangxh/scws
64
+ licenses:
65
+ - MIT
66
+ metadata:
67
+ homepage_uri: https://github.com/xiaohui-zhangxh/scws
68
+ source_code_uri: https://github.com/xiaohui-zhangxh/scws
69
+ post_install_message:
70
+ rdoc_options: []
71
+ require_paths:
72
+ - lib
73
+ required_ruby_version: !ruby/object:Gem::Requirement
74
+ requirements:
75
+ - - ">="
76
+ - !ruby/object:Gem::Version
77
+ version: 2.5.0
78
+ required_rubygems_version: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ requirements: []
84
+ rubygems_version: 3.3.7
85
+ signing_key:
86
+ specification_version: 4
87
+ summary: Integrate (SCWS) Simple Chinese Word Segmentation C lib
88
+ test_files: []