ruby_da 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,358 @@
1
+ // Copyright (C) 2015 Masahiko Higashiyama
2
+ //
3
+ // Permission is hereby granted, free of charge, to any person obtaining a
4
+ // copy of this software and associated documentation files (the "Software"),
5
+ // to deal in the Software without restriction, including without limitation
6
+ // the rights to use, copy, modify, merge, publish, distribute, sublicense,
7
+ // and/or sell copies of the Software, and to permit persons to whom the
8
+ // Software is furnished to do so, subject to the following conditions:
9
+ //
10
+ // The above copyright notice and this permission notice shall be included in
11
+ // all copies or substantial portions of the Software.
12
+ //
13
+ // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19
+ // SOFTWARE.
20
+ #ifndef _DOUBLE_ARRAY_INTERNAL_H_
21
+ #define _DOUBLE_ARRAY_INTERNAL_H_
22
+ #include <vector>
23
+ #include <utility>
24
+ #include <algorithm>
25
+ #include <stdexcept>
26
+ #include <iostream>
27
+ #include <fstream>
28
+
29
+
30
// Dynamic double-array trie over NUL-terminated byte strings.
//
// Layout of array_:
//   * array_[0] is a header: .base holds the number of successful inserts
//     (entry_num) and .check holds the index of the first free slot
//     (empty_head).
//   * array_[1] is the root state; every lookup starts there.
//   * A free slot stores -(index of the next free slot) in .check, so the
//     free slots form an ascending singly linked list whose last element
//     points at array_.size().
//   * A used slot stores the index of its parent state in .check.
//   * A terminal slot (reached through the NUL byte) stores -id in .base.
//     Every other used slot stores the positive base offset of its children,
//     or 0 while it has none.
class DoubleArrayInternal {
  struct node {
    int base;
    int check;
  };

  // Head of the free-slot list (kept in the header slot).
  int& empty_head() {
    return array_[0].check;
  }

  // Number of successful inserts so far (kept in the header slot).
  int& entry_num() {
    return array_[0].base;
  }

  // True when pos is a valid index into array_.
  bool in_range(int pos) const {
    return pos >= 0 && static_cast<size_t>(pos) < array_.size();
  }

  // Marks slot `pos` as a child of state `base`.
  // With overwrite == true an already used slot is simply re-parented;
  // otherwise the slot is first unlinked from the free list.
  // Throws std::runtime_error when `pos` is not found in the free list.
  void set_check(int pos, int base, bool overwrite = false){
    if(overwrite && array_[pos].check > 0){
      array_[pos].check = base;
      return;
    }
    if(pos == empty_head()){
      empty_head() = -array_[pos].check;
      array_[pos].check = base;
      return;
    }
    // Find the free slot whose successor is `pos`.
    const int n = static_cast<int>(array_.size());
    int prev = empty_head();
    while(prev > 0 && prev < n && -array_[prev].check != pos){
      prev = -array_[prev].check;
    }
    if(prev <= 0 || prev >= n) throw std::runtime_error("failed set check");
    array_[prev].check = array_[pos].check;
    array_[pos].check = base;
  }

  // Returns slot `pos` to the free list, keeping the list sorted by index.
  // Throws std::runtime_error when no insertion point exists (for example
  // when `pos` is already free).
  void delete_check(int pos){
    if(pos < empty_head()){
      array_[pos].check = -empty_head();
      empty_head() = pos;
      return;
    }
    const int n = static_cast<int>(array_.size());
    int prev = empty_head();
    while(prev > 0 && prev < n){
      const int next = -array_[prev].check;
      if(prev < pos && pos < next) break;
      prev = next;
    }
    if(prev <= 0 || prev >= n) throw std::runtime_error("failed delete check");
    array_[pos].check = array_[prev].check;
    array_[prev].check = -pos;
  }

  // Grows array_ so that index `pos` is valid. New slots are appended to the
  // free list (each one links to its successor, the last to the new size).
  void expand(size_t pos){
    if(pos < array_.size()) return;
    const size_t first_new = array_.size();
    size_t cap = array_.capacity();
    if(cap == 0) cap = 1;            // doubling zero would never terminate
    while(cap <= pos) cap <<= 1;
    array_.reserve(cap);
    array_.resize(pos + 1);          // new nodes are value-initialised (base 0)
    for(size_t i = first_new; i <= pos; ++i){
      array_[i].check = -static_cast<int>(i + 1);
    }
  }

  // Walks `str` (including its NUL terminator) from the root.
  // Returns (state, p) with state > 0 when the whole key is present: `state`
  // is the node holding the terminal transition and p points at the NUL.
  // Returns (-state, p) when the walk stops early: `state` is the last
  // matched node and p the first byte that has no transition.
  std::pair<int,const char *> fetch(const char *str) const {
    const char *p = str;
    int state = 1;
    for(;;){
      const unsigned char c = static_cast<unsigned char>(*p);
      const int base = array_[state].base;
      const int t = base + c;
      // base <= 0 means "no children"; without this guard a fresh root
      // (base 0) would mistake the header slot 0 for its NUL child.
      if(base <= 0 || !in_range(t) || array_[t].check != state){
        return std::make_pair(-state, p);
      }
      if(c == 0) return std::make_pair(state, p);
      state = t;
      ++p;
    }
  }

  // Appends to `labels` every byte value that has a transition out of state
  // `index`, whose children live at base .. base+255. A non-positive base
  // has no children.
  void get_labels(int index, int base, std::vector<unsigned char> &labels) const {
    if(base <= 0) return;
    const size_t first = static_cast<size_t>(base);
    const size_t last = std::min(array_.size(), first + 256);
    for(size_t i = first; i < last; ++i){
      if(array_[i].check == index){
        labels.push_back(static_cast<unsigned char>(i - first));
      }
    }
  }

  // Finds a base value (> 1) such that base + c is a free slot and
  // base + code is free for every existing child label in `codes`.
  // Grows the array as needed.
  int find_base(const std::vector<unsigned char> &codes, unsigned char c){
    int empty_index = empty_head();
    for(;;){
      expand(empty_index);
      const int base_cand = empty_index - c;
      if(base_cand > 1){
        bool fits = true;
        for(size_t i = 0; i < codes.size(); ++i){
          const int slot = base_cand + codes[i];
          expand(slot);
          if(array_[slot].check > 0){
            fits = false;
            break;
          }
        }
        if(fits) return base_cand;
      }
      empty_index = -array_[empty_index].check;
    }
  }

  // Relocates node `from` (whose base value was `from_base`) to slot `to`:
  // copies the base, re-parents the grandchildren and frees `from`.
  void move_to(int from, int from_base, int to){
    array_[to].base = from_base;
    if(from_base > 0){
      std::vector<unsigned char> trans;
      get_labels(from, from_base, trans);
      for(size_t j = 0; j < trans.size(); j++){
        set_check(from_base + trans[j], to, true);
      }
    }
    array_[from].base = 0;
    delete_check(from);
  }

  // Appends the remaining bytes of `str` (terminator included) below state
  // `base`. When the required slot is occupied, or the state has no base
  // yet, all children of the state are moved to a freshly found base.
  // The terminal slot stores -id, or -(entry_num()+1) when id < 1.
  // Iterative so that long keys cannot exhaust the call stack.
  void _insert(const char *str, int base, int id){
    for(;;){
      const unsigned char c = static_cast<unsigned char>(*str);
      const int oldbase = array_[base].base;
      int pos = oldbase + c;
      bool relocate = (oldbase <= 0);
      if(!relocate){
        expand(pos);
        relocate = (array_[pos].check >= 0);   // slot already taken
      }
      if(relocate){
        std::vector<unsigned char> codes;
        if(oldbase > 0) get_labels(base, oldbase, codes);
        const int base_cand = find_base(codes, c);
        array_[base].base = base_cand;
        // Reserve all new child slots before moving anything.
        std::vector<int> from_base(codes.size());
        for(size_t i = 0; i < codes.size(); ++i){
          from_base[i] = array_[oldbase + codes[i]].base;
          set_check(base_cand + codes[i], base);
        }
        for(size_t i = 0; i < codes.size(); ++i){
          move_to(oldbase + codes[i], from_base[i], base_cand + codes[i]);
        }
        pos = base_cand + c;
      }
      set_check(pos, base);
      if(*str == '\0'){
        array_[pos].base = (id < 1) ? -(entry_num() + 1) : -id;
        entry_num() += 1;
        return;
      }
      base = pos;
      ++str;
    }
  }

  // Removes the transition labelled *p out of state `index`, then keeps
  // removing childless ancestors while walking p back towards `str`.
  // Freed slots get base 0 so that a later insert never inherits a stale
  // (possibly negative) base. entry_num() is not decremented.
  void _erase(const char *str, int index, const char *p){
    for(;;){
      const int parent = array_[index].check;
      // The label must be read as unsigned: bytes >= 0x80 would otherwise
      // produce a negative offset.
      const int child = array_[index].base + static_cast<unsigned char>(*p);
      delete_check(child);
      array_[child].base = 0;
      std::vector<unsigned char> labels;
      get_labels(index, array_[index].base, labels);
      if(!labels.empty() || str == p) return;
      index = parent;
      --p;
    }
  }

  /* For DEBUG */
  // Dumps every slot as "element i:base:check" to std::cout.
  void print_array() const {
    std::cout << "[";
    for(size_t i = 0; i < array_.size(); i++){
      std::cout << "element " << i << ":" << array_[i].base << ":" << array_[i].check << std::endl;
    }
    std::cout << "]" << std::endl;
  }

  // Depth-first walk below `current_idx`. `path` holds the bytes consumed so
  // far; every terminal found appends (key, id) to `result`.
  void _enumerate(int current_idx,
                  std::vector<unsigned char> &path,
                  std::vector<std::pair<std::string, int> > &result) const {
    const int base = array_[current_idx].base;
    std::vector<unsigned char> labels;
    get_labels(current_idx, base, labels);
    for(size_t i = 0; i < labels.size(); ++i){
      const int new_idx = base + labels[i];
      if(labels[i] == 0){
        result.push_back(std::make_pair(std::string(path.begin(), path.end()),
                                        -array_[new_idx].base));
      }else{
        path.push_back(labels[i]);
        _enumerate(new_idx, path, result);
        path.pop_back();
      }
    }
  }

 public:

  // Returns the id stored for `str`, or -1 when the key is absent.
  int exact_match(const char *str) const {
    std::pair<int, const char*> state = fetch(str);
    if(state.first > 0){
      int t = array_[state.first].base + static_cast<unsigned char>(*state.second);
      return -array_[t].base;
    }
    return -1;
  }

  // Appends (key, id) for every stored key that starts with `str`.
  // Nothing is appended when no stored key has that prefix.
  void enumerate(const char *str, std::vector<std::pair<std::string, int> > &result) const {
    int index = 1;
    if(str[0] != '\0'){
      std::pair<int, const char*> state = fetch(str);
      if(state.first > 0){
        index = state.first;
      }else{
        // The walk stopped before the end of the prefix: no key can match.
        if(*state.second != '\0') return;
        index = -state.first;
      }
    }
    std::vector<unsigned char> path;
    for(const char *p = str; *p != '\0'; p++){
      path.push_back(static_cast<unsigned char>(*p));
    }
    _enumerate(index, path, result);
  }

  // Finds every stored key that is a prefix of `str`. For each one its byte
  // length is appended to res_len and its id to res_id. The empty key at the
  // root is never reported.
  void common_prefix_search(const char *str,
                            std::vector<int> &res_len,
                            std::vector<int> &res_id) const {
    const char *p = str;
    int state = 1;
    for(;;){
      const int base = array_[state].base;
      if(base <= 0) return;          // state has no children
      // Slot `base` itself is the NUL child, i.e. the terminal of this state.
      if(state != 1 && in_range(base) && array_[base].check == state){
        res_len.push_back(static_cast<int>(p - str));
        res_id.push_back(-array_[base].base);
      }
      const unsigned char c = static_cast<unsigned char>(*p);
      const int t = base + c;
      if(c == 0 || !in_range(t) || array_[t].check != state) return;
      state = t;
      ++p;
    }
  }

  // Inserts `str`. Returns false when the key is already present.
  // With id < 1 the stored id is the running insert counter plus one.
  bool insert(const char *str, int id = -1){
    std::pair<int,const char*> state = fetch(str);
    if(state.first > 0){
      return false;
    }
    _insert(state.second, -state.first, id);
    return true;
  }

  // Removes `str`. Returns false when the key is not present.
  bool erase(const char *str){
    std::pair<int,const char*> state = fetch(str);
    if(state.first < 0){
      return false;
    }
    _erase(str, state.first, state.second);
    return true;
  }

  // Creates an empty trie with 8193 pre-allocated slots. The root slot 1
  // stays at the head of the free list; find_base never hands it out because
  // it only accepts bases greater than 1.
  DoubleArrayInternal() : array_(2) {
    entry_num() = 0;
    empty_head() = 1;
    array_[1].check = -2;
    expand(8192);
  }

  // Writes the raw node array to `filename`. Returns false on failure.
  bool save(const char *filename) const {
    std::ofstream ofs(filename, std::ios::binary);
    if(!ofs){
      return false;
    }
    return save(ofs);
  }

  // Writes the raw node array (host byte order and int size) to `os`.
  bool save(std::ostream &os) const {
    os.write(reinterpret_cast<const char *>(&array_[0]),
             static_cast<std::streamsize>(array_.size()*sizeof(node)));
    if(os.fail()) return false;
    return true;
  }

  // Replaces the trie with the contents of `filename`. Returns false on
  // failure, in which case the current contents are kept.
  bool load(const char *filename) {
    std::ifstream ifs(filename, std::ios::binary);
    if(!ifs){
      return false;
    }
    return load(ifs);
  }

  // Replaces the trie with the raw node array read from the whole of `is`
  // (the stream must be seekable). Returns false, leaving the current
  // contents untouched, when the stream cannot be measured or read or when
  // it holds fewer than the two mandatory slots (header and root).
  bool load(std::istream &is){
    is.seekg(0, std::ios::end);
    if(is.fail()) return false;
    const std::streamoff bytes = is.tellg();
    if(bytes < 0) return false;
    const size_t count = static_cast<size_t>(bytes) / sizeof(node);
    if(count < 2) return false;
    is.seekg(0, std::ios::beg);
    if(is.fail()) return false;
    std::vector<node> loaded(count);
    const std::streamsize want = static_cast<std::streamsize>(count * sizeof(node));
    is.read(reinterpret_cast<char *>(&loaded[0]), want);
    if(is.gcount() != want) return false;
    array_.swap(loaded);
    return true;
  }

 private:
  std::vector<node> array_;
};
357
+
358
+ #endif /* _DOUBLE_ARRAY_INTERNAL_H_ */
@@ -0,0 +1,117 @@
1
+ // Copyright (C) 2015 Masahiko Higashiyama
2
+ //
3
+ // Permission is hereby granted, free of charge, to any person obtaining a
4
+ // copy of this software and associated documentation files (the "Software"),
5
+ // to deal in the Software without restriction, including without limitation
6
+ // the rights to use, copy, modify, merge, publish, distribute, sublicense,
7
+ // and/or sell copies of the Software, and to permit persons to whom the
8
+ // Software is furnished to do so, subject to the following conditions:
9
+ //
10
+ // The above copyright notice and this permission notice shall be included in
11
+ // all copies or substantial portions of the Software.
12
+ //
13
+ // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19
+ // SOFTWARE.
20
+
21
+ #ifndef UTF8_UTILITY_HPP
22
+ #define UTF8_UTILITY_HPP
23
+
24
+ #include <cstring>
25
+ #include <exception>
26
+ #include <vector>
27
+
28
// Thrown by utf8charlen() when a byte cannot start a UTF-8 character.
class UTF8Exception : public std::exception {
 public:
  UTF8Exception() {}
  virtual ~UTF8Exception() throw() {}
  // Gives catch sites something more useful than the default
  // implementation-defined std::exception text.
  virtual const char *what() const throw() {
    return "invalid UTF-8 sequence";
  }
};
33
+
34
+ int utf8charlen(const unsigned char c)
35
+ {
36
+ if(c == 0x00) return 0;
37
+ if(c < 0x80) return 1;
38
+ if(c < 0xC2) throw UTF8Exception();
39
+ if(c < 0xE0) return 2;
40
+ if(c < 0xF0) return 3;
41
+ if(c < 0xF8) return 4;
42
+ if(c < 0xFC) return 5;
43
+ if(c < 0xFE) return 6;
44
+ return 1;
45
+ }
46
+
47
+ char *utf8substr(const char *s, int len){
48
+ int n = 0, size = 0;
49
+ const char *p = s;
50
+ int l;
51
+ while((l = utf8charlen(*p)) && n != len){
52
+ p += l;
53
+ size += l;
54
+ n++;
55
+ }
56
+ if(l == 0) size++;
57
+ char *str = new char[size + 1];
58
+ strncpy(str, s, size);
59
+ str[size] = '\0';
60
+
61
+ return str;
62
+ }
63
+
64
+ int utf8len(const char *s, int len){
65
+ const char *p = s;
66
+ int n = 0, nc = 0;
67
+ while(*p != '\0' && n < len){
68
+ int k = utf8charlen(*p);
69
+ n += k;
70
+ nc++;
71
+ p += k;
72
+ }
73
+ if(n > len) return -1;
74
+ return nc;
75
+ }
76
+
77
+ const char *utf8nextchar(const char *s){
78
+ return s + utf8charlen(*s);
79
+ }
80
+
81
+ std::vector<const char *> utf8index(const char *s){
82
+ std::vector<const char *> v;
83
+ const char *p = s;
84
+ while(*p != '\0'){
85
+ v.push_back(p);
86
+ p = utf8nextchar(p);
87
+ }
88
+ return v;
89
+ }
90
+
91
+ const char *utf8advance(const char *s, unsigned int len){
92
+ size_t l = 0;
93
+ const char *p = s;
94
+ while(*p != '\0' && l < len ){
95
+ p = utf8nextchar(p);
96
+ l++;
97
+ }
98
+ return p;
99
+ }
100
+
101
+ // // bi-gram extraction example
102
+ // #include <iostream>
103
+ // using namespace std;
104
+ //
105
+ // int main(int argc, char *argv[])
106
+ // {
107
+ // const char *p = "大きなノッポの古時計";
108
+ // while(*p != '\0'){
109
+ // char *sub = utf8substr(p, 2);
110
+ // cout << sub << " ";
111
+ // delete[] sub;
112
+ // p = utf8nextchar(p);
113
+ // }
114
+ // cout << endl;
115
+ // return 0;
116
+ // }
117
+ #endif /* UTF8_UTILITY_HPP */
@@ -0,0 +1,114 @@
1
+ /*
2
+ * WRITTEN BY Masahiko Higashiyama in 2010.
3
+ *
4
+ * THIS CODE IS IN PUBLIC DOMAIN.
5
+ * THIS SOFTWARE IS COMPLETELY FREE TO COPY, MODIFY AND/OR RE-DISTRIBUTE.
6
+ *
7
+ * THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
8
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
9
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
10
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
11
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
12
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
13
+ * SOFTWARE.
14
+ */
15
+
16
+ #include <fstream>
17
+ #include <string>
18
+ #include "../include/double_array.hpp"
19
+ #include "profile.hpp"
20
+ using namespace std;
21
+
22
+
23
+ void print(const DoubleArray &da, const char *str){
24
+ vector<int> v1;
25
+ vector<int> v2;
26
+ cout << "searching " << str << endl;
27
+ da.common_prefix_search(str,v1,v2);
28
+ if(v1.size() == 0) cout << "Not Found" << endl;
29
+ for(size_t i = 0; i < v1.size(); i++)
30
+ cout << v1[i] << " " << v2[i] << endl;
31
+ }
32
+
33
// Appends every line of standard input to `v`, then sorts `v` (stably).
void read(std::vector<std::string> &v){
  for(std::string line; std::getline(std::cin, line); ){
    v.push_back(line);
  }
  std::stable_sort(v.begin(), v.end());
}
40
+
41
+ void build(DoubleArray &da, const vector<string> &v){
42
+ double t1,t2;
43
+ t1 = GetusageSec();
44
+ for(size_t i = 0; i < v.size(); i++){
45
+ da.insert(v[i].c_str());
46
+ //cout << i << " " << v[i] << endl;
47
+ }
48
+ t2 = GetusageSec();
49
+ PrintTime(t1,t2);
50
+ }
51
+
52
+
53
+ void main2(int argc, char *argv[], DoubleArray &da){
54
+
55
+ if(argc < 1 || argv[1] == NULL){
56
+ print(da,"bisons");
57
+ da.erase("bisons");
58
+ cout << "erase bisons" << endl;
59
+ da.erase("bison");
60
+ da.erase("bison");
61
+ cout << "erase bison" << endl;
62
+ print(da,"bisons");
63
+ print(da,"bison");
64
+ da.insert("bisons");
65
+ print(da,"bisons");
66
+ print(da,"bison");
67
+ da.insert("bison");
68
+ print(da,"bison");
69
+ da.erase("ARPANET");
70
+ cout << "erase ARPANET" << endl;
71
+ print(da,"ARPA");
72
+ print(da,"ARPANET");
73
+ }else{
74
+ print(da,argv[1]);
75
+ }
76
+ }
77
+
78
+ void main3(int argc, char *argv[], DoubleArray &da, vector<string> &v){
79
+ double t1,t2;
80
+ t1 = GetusageSec();
81
+ for(size_t i = 0; i < v.size(); i++){
82
+ int a;
83
+ if((a = da.exact_match(v[i].c_str())) == -1){
84
+ cout << "error " << v[i] << " " << a << endl;
85
+ }else{
86
+ //cout << v[i] << " " << a << endl;
87
+ }
88
+ }
89
+ t2 = GetusageSec();
90
+ PrintTime(t1,t2);
91
+ }
92
+
93
+ int main(int argc, char *argv[]){
94
+
95
+ vector<string> lines;
96
+ read(lines);
97
+ DoubleArray da;
98
+ build(da,lines);
99
+ /*
100
+ std::vector<std::pair<std::string, int> > result;
101
+ da.enumerate("", result);
102
+ for(size_t i = 0; i < result.size(); i++){
103
+ std::cout << result[i].first << " " << result[i].second << std::endl;
104
+ }
105
+ */
106
+ std::vector<std::string> res;
107
+ da.extract_all_matched("zoo", res);
108
+ for(size_t i = 0; i < res.size(); i++){
109
+ std::cout << res[i] << std::endl;
110
+ }
111
+
112
+
113
+ return 0;
114
+ }