tx 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.txt +1 -0
- data/ext/Makefile +163 -0
- data/ext/depend +6 -0
- data/ext/extconf.rb +15 -0
- data/ext/ssv.cpp +355 -0
- data/ext/ssv.hpp +93 -0
- data/ext/swig.patch +192 -0
- data/ext/tx.cpp +442 -0
- data/ext/tx.hpp +62 -0
- data/ext/tx_swig.cpp +164 -0
- data/ext/tx_swig.h +93 -0
- data/ext/tx_swig.i +17 -0
- data/ext/tx_swig_wrap.cxx +9884 -0
- data/lib/i386-msvcrt/tx_core.so +0 -0
- data/lib/tx.rb +219 -0
- data/test/test_tx.rb +169 -0
- metadata +86 -0
data/ext/ssv.hpp
ADDED
@@ -0,0 +1,93 @@
|
|
1
|
+
#ifndef __SSV_HPP__
|
2
|
+
#define __SSV_HPP__
|
3
|
+
|
4
|
+
#include <memory.h>
|
5
|
+
#include <vector>
|
6
|
+
#include <cassert>
|
7
|
+
#include <stdio.h>
|
8
|
+
|
9
|
+
namespace tx_tool{
|
10
|
+
|
11
|
+
typedef unsigned int uint; // 32bit
|
12
|
+
typedef unsigned short ushort; // 16bit
|
13
|
+
typedef unsigned char uchar; // 8bit
|
14
|
+
|
15
|
+
|
16
|
+
#define SSV_BLOCK_SHIFT (5)
|
17
|
+
#define SSV_BLOCK (1U << SSV_BLOCK_SHIFT)
|
18
|
+
|
19
|
+
#define SSV_LBLOCK_SHIFT (8)
|
20
|
+
#define SSV_LBLOCK (1U << SSV_LBLOCK_SHIFT)
|
21
|
+
#define SSV_MBLOCK_SHIFT (5)
|
22
|
+
#define SSV_MBLOCK (1U << SSV_MBLOCK_SHIFT)
|
23
|
+
|
24
|
+
#define logLL (16)
|
25
|
+
#define LL (1U << logLL)
|
26
|
+
#define logLLL (5)
|
27
|
+
#define LLL (1U << logLLL)
|
28
|
+
#define logL (logLL-1-5)
|
29
|
+
#define L (1U << logL)
|
30
|
+
|
31
|
+
class ssv{
|
32
|
+
|
33
|
+
public:
|
34
|
+
ssv(const uint _size = 0);
|
35
|
+
ssv(std::vector<bool>& bv);
|
36
|
+
|
37
|
+
int resize(const uint _size);
|
38
|
+
|
39
|
+
void free();
|
40
|
+
|
41
|
+
~ssv();
|
42
|
+
|
43
|
+
inline uint getBit(const uint pos) const {
|
44
|
+
return (B[pos / SSV_BLOCK] >> (pos % SSV_BLOCK)) & 1;
|
45
|
+
}
|
46
|
+
|
47
|
+
uint getBits(const uint pos, uint width) const;
|
48
|
+
void setBits(const uint pos, const uint width, const uint x);
|
49
|
+
uint rank(uint pos, const uint bit) const;
|
50
|
+
uint select(uint pos, const uint bit) const;
|
51
|
+
|
52
|
+
void setBit(uint pos, uint x);
|
53
|
+
uint getAllocate() const;
|
54
|
+
|
55
|
+
void build();
|
56
|
+
|
57
|
+
uint rankBuild(const uint t_size); // return oneNum
|
58
|
+
void selectBuild(const uint t_size);
|
59
|
+
|
60
|
+
int write(FILE* outfp);
|
61
|
+
int read(FILE* infp);
|
62
|
+
|
63
|
+
uint getBlock(const uint blockPos) const;
|
64
|
+
void setBlock(const uint blockPos, const uint x);
|
65
|
+
uint getSize() const;
|
66
|
+
uint getBlockSize() const;
|
67
|
+
|
68
|
+
size_t set_array(void* ptr);
|
69
|
+
|
70
|
+
private:
|
71
|
+
uint popCount(uint r) const;
|
72
|
+
uint _rank1(const uint pos) const;
|
73
|
+
uint _select1(uint x) const;
|
74
|
+
uint _select0(uint x) const;
|
75
|
+
uint* B;
|
76
|
+
uint size;
|
77
|
+
uint oneNum;
|
78
|
+
uint blockSize; // (size+SSV_BLOCK-1)/SSV_BLOCKSZIE
|
79
|
+
|
80
|
+
uint LBlockSize;
|
81
|
+
uint MBlockSize;
|
82
|
+
|
83
|
+
// for rank
|
84
|
+
uint* levelL;
|
85
|
+
uchar* levelM;
|
86
|
+
|
87
|
+
bool isBuild;
|
88
|
+
bool no_delete;
|
89
|
+
};
|
90
|
+
|
91
|
+
}
|
92
|
+
|
93
|
+
#endif
|
data/ext/swig.patch
ADDED
@@ -0,0 +1,192 @@
|
|
1
|
+
Index: Lib/ruby/rubycontainer.swg
|
2
|
+
===================================================================
|
3
|
+
--- Lib/ruby/rubycontainer.swg (revision 11423)
|
4
|
+
+++ Lib/ruby/rubycontainer.swg (working copy)
|
5
|
+
@@ -446,11 +446,10 @@
|
6
|
+
%typemap(out,noblock=1,fragment="RubySequence_Cont")
|
7
|
+
std::pair<const_iterator, const_iterator> {
|
8
|
+
$result = rb_ary_new2(2);
|
9
|
+
- RARRAY_PTR($result)[0] = SWIG_NewPointerObj(swig::make_const_iterator(%static_cast($1,const $type &).first),
|
10
|
+
- swig::ConstIterator::descriptor(),SWIG_POINTER_OWN);
|
11
|
+
- RARRAY_PTR($result)[1] = SWIG_NewPointerObj(swig::make_const_iterator(%static_cast($1,const $type &).second),
|
12
|
+
- swig::ConstIterator::descriptor(),SWIG_POINTER_OWN);
|
13
|
+
- RARRAY_LEN($result) = 2;
|
14
|
+
+ rb_ary_push($result, SWIG_NewPointerObj(swig::make_const_iterator(%static_cast($1,const $type &).first),
|
15
|
+
+ swig::ConstIterator::descriptor(),SWIG_POINTER_OWN));
|
16
|
+
+ rb_ary_push($result, SWIG_NewPointerObj(swig::make_const_iterator(%static_cast($1,const $type &).second),
|
17
|
+
+ swig::ConstIterator::descriptor(),SWIG_POINTER_OWN));
|
18
|
+
}
|
19
|
+
|
20
|
+
// std::map/multimap/set allow returning std::pair< iterator, iterator > from
|
21
|
+
@@ -459,11 +458,10 @@
|
22
|
+
%typemap(out,noblock=1,fragment="RubySequence_Cont")
|
23
|
+
std::pair<iterator, iterator> {
|
24
|
+
$result = rb_ary_new2(2);
|
25
|
+
- RARRAY_PTR($result)[0] = SWIG_NewPointerObj(swig::make_const_iterator(%static_cast($1,const $type &).first),
|
26
|
+
- swig::ConstIterator::descriptor(),SWIG_POINTER_OWN);
|
27
|
+
- RARRAY_PTR($result)[1] = SWIG_NewPointerObj(swig::make_const_iterator(%static_cast($1,const $type &).second),
|
28
|
+
- swig::ConstIterator::descriptor(),SWIG_POINTER_OWN);
|
29
|
+
- RARRAY_LEN($result) = 2;
|
30
|
+
+ rb_ary_push($result, SWIG_NewPointerObj(swig::make_const_iterator(%static_cast($1,const $type &).first),
|
31
|
+
+ swig::ConstIterator::descriptor(),SWIG_POINTER_OWN));
|
32
|
+
+ rb_ary_push($result, SWIG_NewPointerObj(swig::make_const_iterator(%static_cast($1,const $type &).second),
|
33
|
+
+ swig::ConstIterator::descriptor(),SWIG_POINTER_OWN));
|
34
|
+
}
|
35
|
+
|
36
|
+
|
37
|
+
@@ -566,7 +564,8 @@
|
38
|
+
{
|
39
|
+
Sequence::const_iterator i = $self->begin();
|
40
|
+
Sequence::const_iterator e = $self->end();
|
41
|
+
- VALUE str = rb_str_new2( swig::type_name< Sequence >() );
|
42
|
+
+ // Double parentheses to avoid macro expansion error in case Sequence contains comma.
|
43
|
+
+ VALUE str = rb_str_new2(( swig::type_name< Sequence >() ));
|
44
|
+
str = rb_str_cat2( str, " [" );
|
45
|
+
bool comma = false;
|
46
|
+
VALUE tmp;
|
47
|
+
@@ -1112,9 +1111,8 @@
|
48
|
+
int i = 0;
|
49
|
+
for (const_iterator it = seq.begin();
|
50
|
+
it != seq.end(); ++it, ++i) {
|
51
|
+
- RARRAY_PTR(obj)[i] = swig::from< value_type >(*it);
|
52
|
+
+ rb_ary_push(obj, swig::from< value_type >(*it));
|
53
|
+
}
|
54
|
+
- RARRAY_LEN(obj) = size;
|
55
|
+
rb_obj_freeze(obj); // treat as immutable result
|
56
|
+
return obj;
|
57
|
+
} else {
|
58
|
+
Index: Lib/ruby/std_set.i
|
59
|
+
===================================================================
|
60
|
+
--- Lib/ruby/std_set.i (revision 11423)
|
61
|
+
+++ Lib/ruby/std_set.i (working copy)
|
62
|
+
@@ -170,10 +170,9 @@
|
63
|
+
%typemap(out,noblock=1,fragment="RubyPairBoolOutputIterator")
|
64
|
+
std::pair<iterator, bool> {
|
65
|
+
$result = rb_ary_new2(2);
|
66
|
+
- RARRAY_PTR($result)[0] = SWIG_NewPointerObj(swig::make_set_nonconst_iterator(%static_cast($1,$type &).first),
|
67
|
+
- swig::Iterator::descriptor(),SWIG_POINTER_OWN);
|
68
|
+
- RARRAY_PTR($result)[1] = SWIG_From(bool)(%static_cast($1,const $type &).second);
|
69
|
+
- RARRAY_LEN($result) = 2;
|
70
|
+
+ rb_ary_push($result, SWIG_NewPointerObj(swig::make_set_nonconst_iterator(%static_cast($1,$type &).first),
|
71
|
+
+ swig::Iterator::descriptor(),SWIG_POINTER_OWN));
|
72
|
+
+ rb_ary_push($result, SWIG_From(bool)(%static_cast($1,const $type &).second));
|
73
|
+
}
|
74
|
+
|
75
|
+
%extend {
|
76
|
+
Index: Lib/ruby/std_multimap.i
|
77
|
+
===================================================================
|
78
|
+
--- Lib/ruby/std_multimap.i (revision 11423)
|
79
|
+
+++ Lib/ruby/std_multimap.i (working copy)
|
80
|
+
@@ -115,7 +115,8 @@
|
81
|
+
{
|
82
|
+
MultiMap::iterator i = $self->begin();
|
83
|
+
MultiMap::iterator e = $self->end();
|
84
|
+
- VALUE str = rb_str_new2( swig::type_name< MultiMap >() );
|
85
|
+
+ // Double parentheses to avoid macro expansion error in case MultiMap contains comma.
|
86
|
+
+ VALUE str = rb_str_new2(( swig::type_name< MultiMap >() ));
|
87
|
+
str = rb_str_cat2( str, " {" );
|
88
|
+
VALUE tmp;
|
89
|
+
while ( i != e )
|
90
|
+
Index: Lib/ruby/std_pair.i
|
91
|
+
===================================================================
|
92
|
+
--- Lib/ruby/std_pair.i (revision 11423)
|
93
|
+
+++ Lib/ruby/std_pair.i (working copy)
|
94
|
+
@@ -118,11 +118,9 @@
|
95
|
+
|
96
|
+
static VALUE from(const std::pair<T,U>& val) {
|
97
|
+
VALUE obj = rb_ary_new2(2);
|
98
|
+
- RARRAY_PTR(obj)[0] = swig::from<
|
99
|
+
- typename swig::noconst_traits<T >::noconst_type>(val.first);
|
100
|
+
- RARRAY_PTR(obj)[1] = swig::from(val.second);
|
101
|
+
- RARRAY_LEN(obj) = 2;
|
102
|
+
- rb_define_singleton_method(obj, "second",
|
103
|
+
+ rb_ary_push(obj, swig::from<typename swig::noconst_traits<T >::noconst_type>(val.first));
|
104
|
+
+ rb_ary_push(obj, swig::from(val.second));
|
105
|
+
+ rb_define_singleton_method(obj, "second",
|
106
|
+
VALUEFUNC(_wrap_pair_second), 0 );
|
107
|
+
rb_define_singleton_method(obj, "second=",
|
108
|
+
VALUEFUNC(_wrap_pair_second_eq), 1 );
|
109
|
+
@@ -148,7 +146,8 @@
|
110
|
+
VALUE inspect() const
|
111
|
+
{
|
112
|
+
VALUE tmp;
|
113
|
+
- VALUE str = rb_str_new2( swig::type_name< pair >() );
|
114
|
+
+ // Double parentheses to avoid macro expansion error in case pair contains comma.
|
115
|
+
+ VALUE str = rb_str_new2(( swig::type_name< pair >() ));
|
116
|
+
str = rb_str_cat2( str, " (" );
|
117
|
+
tmp = swig::from( $self->first );
|
118
|
+
tmp = rb_obj_as_string( tmp );
|
119
|
+
Index: Lib/ruby/rubyclasses.swg
|
120
|
+
===================================================================
|
121
|
+
--- Lib/ruby/rubyclasses.swg (revision 11423)
|
122
|
+
+++ Lib/ruby/rubyclasses.swg (working copy)
|
123
|
+
@@ -315,30 +315,36 @@
|
124
|
+
|
125
|
+
};
|
126
|
+
|
127
|
+
- ID GC_VALUE::hash_id = rb_intern("hash");
|
128
|
+
- ID GC_VALUE::lt_id = rb_intern("<");
|
129
|
+
- ID GC_VALUE::gt_id = rb_intern(">");
|
130
|
+
- ID GC_VALUE::eq_id = rb_intern("==");
|
131
|
+
- ID GC_VALUE::le_id = rb_intern("<=");
|
132
|
+
- ID GC_VALUE::ge_id = rb_intern(">=");
|
133
|
+
+ // We need this because rb_intern macro uses statement-expression and
|
134
|
+
+ // statement-expression is allowed only inside functions.
|
135
|
+
+ static ID rb_intern_wrapper(const char* str) {
|
136
|
+
+ return rb_intern(str);
|
137
|
+
+ }
|
138
|
+
|
139
|
+
- ID GC_VALUE::pos_id = rb_intern("+@");
|
140
|
+
- ID GC_VALUE::neg_id = rb_intern("-@");
|
141
|
+
- ID GC_VALUE::inv_id = rb_intern("~");
|
142
|
+
+ ID GC_VALUE::hash_id = rb_intern_wrapper("hash");
|
143
|
+
+ ID GC_VALUE::lt_id = rb_intern_wrapper("<");
|
144
|
+
+ ID GC_VALUE::gt_id = rb_intern_wrapper(">");
|
145
|
+
+ ID GC_VALUE::eq_id = rb_intern_wrapper("==");
|
146
|
+
+ ID GC_VALUE::le_id = rb_intern_wrapper("<=");
|
147
|
+
+ ID GC_VALUE::ge_id = rb_intern_wrapper(">=");
|
148
|
+
|
149
|
+
- ID GC_VALUE::add_id = rb_intern("+");
|
150
|
+
- ID GC_VALUE::sub_id = rb_intern("-");
|
151
|
+
- ID GC_VALUE::mul_id = rb_intern("*");
|
152
|
+
- ID GC_VALUE::div_id = rb_intern("/");
|
153
|
+
- ID GC_VALUE::mod_id = rb_intern("%");
|
154
|
+
+ ID GC_VALUE::pos_id = rb_intern_wrapper("+@");
|
155
|
+
+ ID GC_VALUE::neg_id = rb_intern_wrapper("-@");
|
156
|
+
+ ID GC_VALUE::inv_id = rb_intern_wrapper("~");
|
157
|
+
|
158
|
+
- ID GC_VALUE::and_id = rb_intern("&");
|
159
|
+
- ID GC_VALUE::or_id = rb_intern("|");
|
160
|
+
- ID GC_VALUE::xor_id = rb_intern("^");
|
161
|
+
+ ID GC_VALUE::add_id = rb_intern_wrapper("+");
|
162
|
+
+ ID GC_VALUE::sub_id = rb_intern_wrapper("-");
|
163
|
+
+ ID GC_VALUE::mul_id = rb_intern_wrapper("*");
|
164
|
+
+ ID GC_VALUE::div_id = rb_intern_wrapper("/");
|
165
|
+
+ ID GC_VALUE::mod_id = rb_intern_wrapper("%");
|
166
|
+
|
167
|
+
- ID GC_VALUE::lshift_id = rb_intern("<<");
|
168
|
+
- ID GC_VALUE::rshift_id = rb_intern(">>");
|
169
|
+
+ ID GC_VALUE::and_id = rb_intern_wrapper("&");
|
170
|
+
+ ID GC_VALUE::or_id = rb_intern_wrapper("|");
|
171
|
+
+ ID GC_VALUE::xor_id = rb_intern_wrapper("^");
|
172
|
+
|
173
|
+
+ ID GC_VALUE::lshift_id = rb_intern_wrapper("<<");
|
174
|
+
+ ID GC_VALUE::rshift_id = rb_intern_wrapper(">>");
|
175
|
+
+
|
176
|
+
VALUE GC_VALUE::_hash = Qnil;
|
177
|
+
|
178
|
+
typedef GC_VALUE LANGUAGE_OBJ;
|
179
|
+
Index: Lib/ruby/std_map.i
|
180
|
+
===================================================================
|
181
|
+
--- Lib/ruby/std_map.i (revision 11423)
|
182
|
+
+++ Lib/ruby/std_map.i (working copy)
|
183
|
+
@@ -345,7 +345,8 @@
|
184
|
+
{
|
185
|
+
Map::const_iterator i = $self->begin();
|
186
|
+
Map::const_iterator e = $self->end();
|
187
|
+
- VALUE str = rb_str_new2( swig::type_name< Map >() );
|
188
|
+
+ // Double parentheses to avoid macro expansion error in case Map contains comma.
|
189
|
+
+ VALUE str = rb_str_new2(( swig::type_name< Map >() ));
|
190
|
+
str = rb_str_cat2( str, " {" );
|
191
|
+
bool comma = false;
|
192
|
+
VALUE tmp;
|
data/ext/tx.cpp
ADDED
@@ -0,0 +1,442 @@
|
|
1
|
+
#include "tx.hpp"
|
2
|
+
|
3
|
+
#include <climits>
|
4
|
+
#include "ssv.hpp"
|
5
|
+
namespace tx_tool{
|
6
|
+
|
7
|
+
// Sentinel id meaning "no key matched"; prefixSearch() returns it when nothing is found.
uint tx::NOTFOUND = UINT_MAX;
|
8
|
+
|
9
|
+
// Work item for the breadth-first trie construction in tx::build():
// the words at indices [left, right) of the sorted word list, examined at
// character index `depth`.
struct queue_elem{
  size_t left;   // first word index of the range
  size_t right;  // one past the last word index
  int depth;     // character index being branched on

  queue_elem(size_t _left, size_t _right, int _depth)
    : left(_left),
      right(_right),
      depth(_depth)
  {
  }
};
|
15
|
+
|
16
|
+
// Creates an empty dictionary: no edge buffer, zero keys, and any buffer
// allocated later is owned (and freed) by this object.
tx::tx()
  : edge(NULL),
    keyNum(0),
    no_delete(false)
{
}
|
17
|
+
|
18
|
+
// Frees the edge-label buffer unless it was installed by setArray(), in which
// case the memory belongs to the caller and is left alone.
tx::~tx(){
  if (no_delete) {
    return;
  }
  delete[] edge;
  edge = NULL;
  no_delete = false;
}
|
25
|
+
|
26
|
+
int tx::build(std::vector<std::string>& wordList, const char* fileName) {
|
27
|
+
sort(wordList.begin(),wordList.end());
|
28
|
+
const size_t origWordNum = wordList.size();
|
29
|
+
wordList.erase(unique(wordList.begin(),wordList.end()),wordList.end());
|
30
|
+
int keyNum = (int)wordList.size();
|
31
|
+
if (keyNum != origWordNum){
|
32
|
+
resultLog << "shrink word list " << origWordNum << " -> " << keyNum << std::endl;
|
33
|
+
} else {
|
34
|
+
resultLog << "word list " << keyNum << " elements" << std::endl;
|
35
|
+
}
|
36
|
+
|
37
|
+
uint totalSize = 0;
|
38
|
+
for (size_t i = 0; i < wordList.size(); i++){
|
39
|
+
totalSize += (uint)wordList[i].size();
|
40
|
+
}
|
41
|
+
|
42
|
+
FILE* outfp = fopen(fileName,"wb");
|
43
|
+
if (outfp == NULL){
|
44
|
+
errorLog << "cannot open " << fileName << std::endl;
|
45
|
+
return -1;
|
46
|
+
}
|
47
|
+
|
48
|
+
std::queue<queue_elem> q;
|
49
|
+
if (keyNum != 0){
|
50
|
+
q.push(queue_elem(0,keyNum,0));
|
51
|
+
}
|
52
|
+
if (fwrite(&keyNum,sizeof(int),1,outfp) != 1){
|
53
|
+
errorLog << "fwrite error " << std::endl;
|
54
|
+
return -1;
|
55
|
+
}
|
56
|
+
|
57
|
+
std::vector<bool> vb_loud;
|
58
|
+
std::vector<bool> vb_terminal;
|
59
|
+
|
60
|
+
vb_loud.push_back(0); // super root
|
61
|
+
vb_loud.push_back(1);
|
62
|
+
|
63
|
+
uint nodeNum = 0;
|
64
|
+
|
65
|
+
while (!q.empty()){
|
66
|
+
queue_elem& elem = q.front();
|
67
|
+
const int depth = elem.depth;
|
68
|
+
const size_t left = elem.left;
|
69
|
+
const size_t right = elem.right;
|
70
|
+
q.pop();
|
71
|
+
|
72
|
+
nodeNum++;
|
73
|
+
size_t newLeft = left;
|
74
|
+
if (wordList[left].size() == depth){
|
75
|
+
vb_terminal.push_back(1); // this node has terminate
|
76
|
+
newLeft++;
|
77
|
+
if (newLeft == right){
|
78
|
+
vb_loud.push_back(1);
|
79
|
+
continue;
|
80
|
+
}
|
81
|
+
} else {
|
82
|
+
vb_terminal.push_back(0);
|
83
|
+
}
|
84
|
+
size_t prev = newLeft;
|
85
|
+
char prev_c = wordList[prev][depth];
|
86
|
+
for (size_t i = newLeft+1; i < right; i++){
|
87
|
+
if (prev_c != wordList[i][depth]){
|
88
|
+
fputc(prev_c,outfp);
|
89
|
+
vb_loud.push_back(0);
|
90
|
+
q.push(queue_elem(prev,i,depth+1));
|
91
|
+
prev = i;
|
92
|
+
prev_c = wordList[prev][depth];
|
93
|
+
}
|
94
|
+
}
|
95
|
+
if (prev != right){
|
96
|
+
fputc(prev_c,outfp);
|
97
|
+
vb_loud.push_back(0);
|
98
|
+
q.push(queue_elem(prev,right,depth+1));
|
99
|
+
}
|
100
|
+
vb_loud.push_back(1);
|
101
|
+
}
|
102
|
+
|
103
|
+
{
|
104
|
+
ssv sv(vb_loud);
|
105
|
+
sv.build();
|
106
|
+
if (sv.write(outfp) == -1){
|
107
|
+
errorLog << "fwrite error " << std::endl;
|
108
|
+
return -1;
|
109
|
+
}
|
110
|
+
}
|
111
|
+
{
|
112
|
+
ssv sv(vb_terminal);
|
113
|
+
sv.build();
|
114
|
+
if (sv.write(outfp) == -1){
|
115
|
+
errorLog << "fwrite error " << std::endl;
|
116
|
+
return -1;
|
117
|
+
}
|
118
|
+
}
|
119
|
+
|
120
|
+
if (fwrite(&nodeNum,sizeof(int),1,outfp) != 1){
|
121
|
+
errorLog << "fwrite error " << std::endl;
|
122
|
+
return -1;
|
123
|
+
}
|
124
|
+
|
125
|
+
size_t outfpSize = ftell(outfp);
|
126
|
+
resultLog << "outputSize:" << outfpSize << " inputSize:" << totalSize << " ratio:" << (float)outfpSize/totalSize << std::endl;
|
127
|
+
if (outfp) fclose(outfp);
|
128
|
+
return 0;
|
129
|
+
}
|
130
|
+
|
131
|
+
int tx::read(const char* fileName){
|
132
|
+
FILE* infp = fopen(fileName,"rb");
|
133
|
+
if (infp == NULL){
|
134
|
+
errorLog << "cannot open " << fileName << std::endl;
|
135
|
+
return -1;
|
136
|
+
}
|
137
|
+
|
138
|
+
keyNum = 0;
|
139
|
+
if (fread(&keyNum,sizeof(int),1,infp) != 1){
|
140
|
+
errorLog << "keyNum read error" << std::endl;
|
141
|
+
fclose(infp);
|
142
|
+
return -1;
|
143
|
+
}
|
144
|
+
|
145
|
+
fseek(infp,0,SEEK_END);
|
146
|
+
size_t fileSize = ftell(infp);
|
147
|
+
if (fseek(infp,fileSize-(1*sizeof(int)),SEEK_SET) == -1){
|
148
|
+
errorLog << "fseek error" << std::endl;
|
149
|
+
fclose(infp);
|
150
|
+
return -1;
|
151
|
+
}
|
152
|
+
|
153
|
+
int nodeNum = -1;
|
154
|
+
if (fread(&nodeNum,sizeof(int),1,infp) != 1){
|
155
|
+
errorLog << "nodeNum read error" << std::endl;
|
156
|
+
fclose(infp);
|
157
|
+
return -1;
|
158
|
+
}
|
159
|
+
resultLog << "keyNum:" << (int)keyNum << " nodeNum:" << nodeNum << std::endl;
|
160
|
+
|
161
|
+
if (fseek(infp,sizeof(int)*1,SEEK_SET) == -1){
|
162
|
+
errorLog << "fseek error" << std::endl;
|
163
|
+
fclose(infp);
|
164
|
+
return -1;
|
165
|
+
}
|
166
|
+
|
167
|
+
if (nodeNum > 0){
|
168
|
+
edge = new char [nodeNum-1];
|
169
|
+
if (fread(edge,sizeof(char),nodeNum-1,infp) != nodeNum-1){
|
170
|
+
errorLog << "fseek error" << std::endl;
|
171
|
+
fclose(infp);
|
172
|
+
return -1;
|
173
|
+
}
|
174
|
+
}
|
175
|
+
|
176
|
+
loud.read(infp);
|
177
|
+
terminal.read(infp);
|
178
|
+
|
179
|
+
if (infp) fclose(infp);
|
180
|
+
return 0;
|
181
|
+
}
|
182
|
+
|
183
|
+
int tx::setArray(void* ptr, size_t readSize){
|
184
|
+
keyNum = *(uint*)(ptr);
|
185
|
+
printf("keyNum:%d\n", keyNum);
|
186
|
+
int nodeNum = *(uint*)((uchar*)ptr+readSize-sizeof(uint));
|
187
|
+
printf("nodeNum:%d\n", nodeNum);
|
188
|
+
edge = (char*)ptr + sizeof(uint);
|
189
|
+
size_t readNum = loud.set_array((void*)((uchar*)ptr + sizeof(uint) + nodeNum - 1));
|
190
|
+
size_t readNum2 = terminal.set_array((void*)((uchar*)ptr + sizeof(uint) + nodeNum - 1 + readNum));
|
191
|
+
if (sizeof(uint) + nodeNum + - 1 + readNum + readNum2 + sizeof(uint) != readSize){
|
192
|
+
errorLog << "setArray error" << std::endl;
|
193
|
+
return -1;
|
194
|
+
}
|
195
|
+
no_delete = true;
|
196
|
+
return 0;
|
197
|
+
}
|
198
|
+
|
199
|
+
// Follows str[0..len) down the trie, starting at position 2 of the `loud`
// vector, and reports the last key met on the way.
//
// retLen      receives the number of characters consumed at the last recorded
//             node, or NOTFOUND if nothing was recorded.
// matchPrefix when true, non-key nodes are recorded too (with id NOTFOUND),
//             so retLen ends up as the depth of the last node reached.
// Returns the id of the recorded key, or NOTFOUND.
// The search is skipped entirely when `terminal` holds two bits or fewer.
uint tx::prefixSearch(const char* str, const size_t len, size_t& retLen, bool matchPrefix) const {
  retLen = NOTFOUND;
  uint foundId = NOTFOUND;
  if (terminal.getSize() <= 2) {
    return foundId;
  }

  uint pos = 2;
  size_t consumed = 0;
  while (true){
    const uint nodeId = loud.rank(pos-1,1)-1;
    if (terminal.getBit(nodeId)){
      foundId = terminal.rank(nodeId,1)-1;
      retLen = consumed;
    } else if (matchPrefix){
      foundId = NOTFOUND;
      retLen = consumed;
    }

    if (consumed == len) {
      break;
    }
    const uint child = getChild(pos,str[consumed]);
    if (child == UINT_MAX) {
      break;
    }
    pos = child;
    ++consumed;
  }
  return foundId;
}
|
223
|
+
|
224
|
+
uint tx::expandSearch(const char* str, const size_t len, std::vector<std::string>& ret, const uint limit) const {
|
225
|
+
ret.clear();
|
226
|
+
if (limit == 0) return 0;
|
227
|
+
if (terminal.getSize() <= 2) return 0;
|
228
|
+
|
229
|
+
bool prefix = false;
|
230
|
+
uint curPos = 2;
|
231
|
+
for (size_t i = 0; i < len; i++){
|
232
|
+
uint nextPos = getChild(curPos,str[i]);
|
233
|
+
const uint nodeId = loud.rank(curPos-1,1)-1;
|
234
|
+
if (terminal.getBit(nodeId)){
|
235
|
+
ret.push_back(std::string(str,str+i));
|
236
|
+
}
|
237
|
+
|
238
|
+
if (nextPos == UINT_MAX){
|
239
|
+
prefix = true;
|
240
|
+
break;
|
241
|
+
}
|
242
|
+
curPos = nextPos;
|
243
|
+
}
|
244
|
+
|
245
|
+
if (!prefix){
|
246
|
+
std::string curStr(str, len);
|
247
|
+
std::vector<std::pair<size_t, std::pair<std::string, uint> > > ret_p;
|
248
|
+
enumerateAll(curPos,curStr,ret_p);
|
249
|
+
sort(ret_p.begin(),ret_p.end());
|
250
|
+
for (size_t i = 0; i < ret_p.size() && i < limit; i++){
|
251
|
+
ret.push_back(ret_p[i].second.first);
|
252
|
+
}
|
253
|
+
}
|
254
|
+
return (uint)ret.size();
|
255
|
+
}
|
256
|
+
|
257
|
+
uint tx::commonPrefixSearch(const char* str, const size_t len, std::vector<std::string>& ret, std::vector<uint>& retID, const uint limit) const{
|
258
|
+
ret.clear();
|
259
|
+
retID.clear();
|
260
|
+
if (limit == 0) return 0;
|
261
|
+
if (terminal.getSize() <= 2) return 0;
|
262
|
+
|
263
|
+
uint curPos = 2;
|
264
|
+
|
265
|
+
for (size_t i = 0; ; i++){
|
266
|
+
const uint nodeId = loud.rank(curPos-1,1)-1;
|
267
|
+
if (terminal.getBit(nodeId)){
|
268
|
+
ret.push_back(std::string(str, str+i));
|
269
|
+
retID.push_back(terminal.rank(nodeId,1)-1);
|
270
|
+
if (ret.size() == limit) break;
|
271
|
+
}
|
272
|
+
if (i == len) break;
|
273
|
+
|
274
|
+
uint nextPos = getChild(curPos,str[i]);
|
275
|
+
if (nextPos == UINT_MAX){
|
276
|
+
break;
|
277
|
+
}
|
278
|
+
curPos = nextPos;
|
279
|
+
}
|
280
|
+
return (uint)ret.size();
|
281
|
+
}
|
282
|
+
|
283
|
+
// Variant of commonPrefixSearch that reports match lengths instead of strings:
// for every key that is a prefix of str[0..len), retLen gets its length and
// retID its id (both vectors are cleared first), shortest match first and at
// most `limit` entries.
// Returns the number of matches; 0 when limit is 0 or `terminal` holds two
// bits or fewer.
uint tx::commonPrefixSearch(const char* str, const size_t len, std::vector<uint>& retLen, std::vector<uint>& retID, const uint limit) const{
  retLen.clear();
  retID.clear();
  if (limit == 0) {
    return 0;
  }
  if (terminal.getSize() <= 2) {
    return 0;
  }

  uint pos = 2;
  size_t depth = 0;
  while (true){
    const uint nodeId = loud.rank(pos-1,1)-1;
    if (terminal.getBit(nodeId)){
      retLen.push_back(depth);
      retID.push_back(terminal.rank(nodeId,1)-1);
      if (retLen.size() == limit) {
        break;
      }
    }
    if (depth == len) {
      break;
    }

    const uint child = getChild(pos,str[depth]);
    if (child == UINT_MAX) {
      break;
    }
    pos = child;
    ++depth;
  }

  return (uint)retLen.size();
}
|
308
|
+
|
309
|
+
|
310
|
+
uint tx::predictiveSearch(const char* str, const size_t len, std::vector<std::string>& ret, std::vector<uint>& retID, const uint limit) const{
|
311
|
+
ret.clear();
|
312
|
+
retID.clear();
|
313
|
+
if (limit == 0) return 0;
|
314
|
+
if (terminal.getSize() <= 2) return 0;
|
315
|
+
|
316
|
+
bool prefix = false;
|
317
|
+
uint curPos = 2;
|
318
|
+
for (size_t i = 0; i < len; i++){
|
319
|
+
uint nextPos = getChild(curPos,str[i]);
|
320
|
+
if (nextPos == UINT_MAX){
|
321
|
+
prefix = true;
|
322
|
+
break;
|
323
|
+
}
|
324
|
+
curPos = nextPos;
|
325
|
+
}
|
326
|
+
|
327
|
+
if (!prefix){
|
328
|
+
std::string curStr(str, len);
|
329
|
+
std::vector<std::pair<size_t, std::pair<std::string, uint> > > ret_p;
|
330
|
+
enumerateAll(curPos, curStr, ret_p);
|
331
|
+
sort(ret_p.begin(),ret_p.end());
|
332
|
+
for (size_t i = 0; i < ret_p.size() && i < limit; i++){
|
333
|
+
ret.push_back(ret_p[i].second.first);
|
334
|
+
retID.push_back(ret_p[i].second.second);
|
335
|
+
}
|
336
|
+
}
|
337
|
+
return (uint)ret.size();
|
338
|
+
}
|
339
|
+
|
340
|
+
uint tx::predictiveSearch(const char* str, const size_t len, std::vector<uint>& retLen, std::vector<uint>& retID, const uint limit) const{
|
341
|
+
retLen.clear();
|
342
|
+
retID.clear();
|
343
|
+
if (limit == 0) return 0;
|
344
|
+
if (terminal.getSize() <= 2) return 0;
|
345
|
+
|
346
|
+
bool prefix = false;
|
347
|
+
uint curPos = 2;
|
348
|
+
for (size_t i = 0; i < len; i++){
|
349
|
+
uint nextPos = getChild(curPos,str[i]);
|
350
|
+
if (nextPos == UINT_MAX){
|
351
|
+
prefix = true;
|
352
|
+
break;
|
353
|
+
}
|
354
|
+
curPos = nextPos;
|
355
|
+
}
|
356
|
+
|
357
|
+
if (!prefix){
|
358
|
+
std::string curStr(str, len);
|
359
|
+
std::vector<std::pair<size_t, std::pair<std::string, uint> > > ret_p;
|
360
|
+
enumerateAll(curPos, curStr, ret_p);
|
361
|
+
sort(ret_p.begin(),ret_p.end());
|
362
|
+
for (size_t i = 0; i < ret_p.size() && i < limit; i++){
|
363
|
+
retLen.push_back(ret_p[i].second.first.size());
|
364
|
+
retID.push_back(ret_p[i].second.second);
|
365
|
+
}
|
366
|
+
}
|
367
|
+
return (uint)retLen.size();
|
368
|
+
}
|
369
|
+
|
370
|
+
|
371
|
+
|
372
|
+
void tx::enumerateAll(const uint pos, const std::string str, std::vector<std::pair<size_t, std::pair<std::string, uint> > >& ret) const{
|
373
|
+
const uint nodeId = loud.rank(pos-1,1)-1;
|
374
|
+
if (terminal.getBit(nodeId)){
|
375
|
+
std::pair<std::string, uint> tmp(str, terminal.rank(nodeId,1)-1);
|
376
|
+
ret.push_back(std::make_pair<size_t, std::pair<std::string, uint> >(str.size(), tmp));
|
377
|
+
}
|
378
|
+
|
379
|
+
uint curPos = pos;
|
380
|
+
uint edgePos = loud.rank(pos,0)-2;
|
381
|
+
while (loud.getBit(curPos) == 0){
|
382
|
+
const uint nextPos = loud.select(loud.rank(curPos,0),1)+1;
|
383
|
+
enumerateAll(nextPos,str + edge[edgePos],ret);
|
384
|
+
curPos++;
|
385
|
+
edgePos++;
|
386
|
+
}
|
387
|
+
}
|
388
|
+
|
389
|
+
// Looks for the child reached over edge label `c` from the node whose child
// list starts at `pos` of the `loud` vector.
// Scans the node's 0 bits and the matching edge labels in step; returns the
// start position of the child's own list, or UINT_MAX if the terminating 1 bit
// is reached without a match.
uint tx::getChild(const uint pos, const char c) const{
  uint edgeIdx = loud.rank(pos,0)-2;
  for (uint slot = pos; loud.getBit(slot) != 1; ++slot, ++edgeIdx){
    if (edge[edgeIdx] == c){
      return loud.select(loud.rank(slot,0),1)+1;
    }
  }
  return UINT_MAX;
}
|
405
|
+
|
406
|
+
// Moves one step towards the root.
// c receives edge[rank0(pos) - 2], the edge label associated with `pos`; the
// return value is select0(rank1(pos - 1)) on the `loud` vector.
// reverseLookup() feeds the result back in until it drops below 2.
uint tx::getParent(const uint pos, char& c) const{
  const uint edgeIdx = loud.rank(pos,0)-2;
  c = edge[edgeIdx];

  const uint nodeRank = loud.rank(pos-1, 1);
  return loud.select(nodeRank, 0);
}
|
410
|
+
|
411
|
+
|
412
|
+
uint tx::reverseLookup(const uint id, std::string& ret) const {
|
413
|
+
ret.clear();
|
414
|
+
if (id >= keyNum) return 0;
|
415
|
+
if (terminal.getSize() <= 2) return 0;
|
416
|
+
|
417
|
+
const uint nodeId = terminal.select(id + 1, 1);
|
418
|
+
char unused_c = 0;
|
419
|
+
uint curPos = getParent(loud.select(nodeId+1,1)+1, unused_c);
|
420
|
+
while (curPos >= 2){
|
421
|
+
char c = 0;
|
422
|
+
curPos = getParent(curPos, c);
|
423
|
+
ret += c;
|
424
|
+
}
|
425
|
+
reverse(ret.begin(), ret.end());
|
426
|
+
return ret.size();
|
427
|
+
}
|
428
|
+
|
429
|
+
std::string tx::getResultLog() const {
|
430
|
+
return resultLog.str();
|
431
|
+
}
|
432
|
+
|
433
|
+
std::string tx::getErrorLog() const{
|
434
|
+
return errorLog.str();
|
435
|
+
}
|
436
|
+
|
437
|
+
// Returns the key count last loaded by read() or setArray(); 0 for a freshly
// constructed object.
uint tx::getKeyNum() const {
  const uint count = keyNum;
  return count;
}
|
440
|
+
|
441
|
+
|
442
|
+
} // namespace tx_tool
|